Diffstat (limited to 'include')
315 files changed, 65580 insertions, 0 deletions
diff --git a/include/haproxy/acl-t.h b/include/haproxy/acl-t.h new file mode 100644 index 0000000..34b7e40 --- /dev/null +++ b/include/haproxy/acl-t.h @@ -0,0 +1,160 @@ +/* + * include/haproxy/acl-t.h + * This file provides structures and types for ACLs. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_ACL_T_H +#define _HAPROXY_ACL_T_H + +#include <haproxy/pattern-t.h> +#include <haproxy/sample-t.h> + +/* ACL test result. + * + * We're using a 3-state matching system : + * - PASS : at least one pattern already matches + * - MISS : some data is missing to decide if some rules may finally match. + * - FAIL : no pattern may ever match + * + * We assign values 0, 1 and 3 to FAIL, MISS and PASS respectively, so that we + * can make use of standard arithmetic for the truth tables below : + * + * x | !x x&y | F(0) | M(1) | P(3) x|y | F(0) | M(1) | P(3) + * ------+----- -----+------+------+----- -----+------+------+----- + * F(0) | P(3) F(0)| F(0) | F(0) | F(0) F(0)| F(0) | M(1) | P(3) + * M(1) | M(1) M(1)| F(0) | M(1) | M(1) M(1)| M(1) | M(1) | P(3) + * P(3) | F(0) P(3)| F(0) | M(1) | P(3) P(3)| P(3) | P(3) | P(3) + * + * neg(x) = (3 >> x) and(x,y) = (x & y) or(x,y) = (x | y) + * + * For efficiency, the ACL return flags are directly mapped from the pattern + * match flags. See include/pattern.h for existing values. + */ +enum acl_test_res { + ACL_TEST_FAIL = 0, /* test failed */ + ACL_TEST_MISS = 1, /* test may pass with more info */ + ACL_TEST_PASS = 3, /* test passed */ +}; + +/* Condition polarity. It makes it easier for any option to choose between + * IF/UNLESS if it can store that information within the condition itself. + * Those should be interpreted as "IF/UNLESS result == PASS". + */ +enum acl_cond_pol { + ACL_COND_NONE, /* no polarity set yet */ + ACL_COND_IF, /* positive condition (after 'if') */ + ACL_COND_UNLESS, /* negative condition (after 'unless') */ +}; + +/* + * ACL keyword: Associates keywords with parsers, methods to retrieve the value and testers. + */ +/* + * NOTE: + * The 'parse' function is called to parse words in the configuration. It must + * return the number of valid words read. 0 = error. The 'opaque' argument may + * be used by functions which need to maintain a context between consecutive + * values. It is initialized to zero before the first call, and passed along + * successive calls. 
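Editor's aside: the arithmetic identities in the comment above are easy to sanity-check. A minimal standalone sketch (not part of the diff) verifying the FAIL/MISS/PASS encoding against the truth tables:

```c
#include <assert.h>

/* mirrors enum acl_test_res above: FAIL=0, MISS=1, PASS=3 */
enum { FAIL = 0, MISS = 1, PASS = 3 };

int main(void)
{
	/* neg(x) = (3 >> x): swaps FAIL and PASS, keeps MISS */
	assert((3 >> FAIL) == PASS && (3 >> MISS) == MISS && (3 >> PASS) == FAIL);

	/* and(x,y) = (x & y): FAIL dominates, PASS requires both sides */
	assert((PASS & PASS) == PASS && (PASS & MISS) == MISS && (MISS & FAIL) == FAIL);

	/* or(x,y) = (x | y): PASS dominates, FAIL is neutral */
	assert((FAIL | FAIL) == FAIL && (FAIL | MISS) == MISS && (MISS | PASS) == PASS);
	return 0;
}
```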
+ */ + +struct acl_expr; +struct acl_keyword { + const char *kw; + char *fetch_kw; + int match_type; /* Contains PAT_MATCH_* */ + int (*parse)(const char *text, struct pattern *pattern, int flags, char **err); + int (*index)(struct pattern_expr *expr, struct pattern *pattern, char **err); + void (*delete)(struct pat_ref *, struct pat_ref_elt *); + void (*prune)(struct pattern_expr *expr); + struct pattern *(*match)(struct sample *smp, struct pattern_expr *expr, int fill); + /* must be after the config params */ + struct sample_fetch *smp; /* the sample fetch we depend on */ +}; + +/* + * A keyword list. It is a NULL-terminated array of keywords. It embeds a + * struct list in order to be linked to other lists, allowing it to easily + * be declared where it is needed, and linked without duplicating data nor + * allocating memory. + */ +struct acl_kw_list { + struct list list; + struct acl_keyword kw[VAR_ARRAY]; +}; + +/* + * Description of an ACL expression. + * The expression is part of a list. It contains pointers to the keyword, the + * sample fetch descriptor which defaults to the keyword's, and the associated + * pattern matching. The structure is organized so that the hot parts are + * grouped together in order to optimize caching. + */ +struct acl_expr { + struct sample_expr *smp; /* the sample expression we depend on */ + struct pattern_head pat; /* the pattern matching expression */ + struct list list; /* chaining */ + const char *kw; /* points to the ACL kw's name or fetch's name (must not free) */ +}; + +/* The acl will be linked to from the proxy where it is declared */ +struct acl { + struct list list; /* chaining */ + char *name; /* acl name */ + struct list expr; /* list of acl_exprs */ + unsigned int use; /* or'ed bit mask of all acl_expr's SMP_USE_* */ + unsigned int val; /* or'ed bit mask of all acl_expr's SMP_VAL_* */ +}; + +/* the condition will be linked to from an action in a proxy */ +struct acl_term { + struct list list; /* chaining */ + struct acl *acl; /* acl pointed to by this term */ + int neg; /* 1 if the ACL result must be negated */ +}; + +struct acl_term_suite { + struct list list; /* chaining of term suites */ + struct list terms; /* list of acl_terms */ +}; + +struct acl_cond { + struct list list; /* Some specific tests may use multiple conditions */ + struct list suites; /* list of acl_term_suites */ + enum acl_cond_pol pol; /* polarity: ACL_COND_IF / ACL_COND_UNLESS */ + unsigned int use; /* or'ed bit mask of all suites' SMP_USE_* */ + unsigned int val; /* or'ed bit mask of all suites' SMP_VAL_* */ + const char *file; /* config file where the condition is declared */ + int line; /* line in the config file where the condition is declared */ +}; + +struct acl_sample { + struct acl_cond cond; + struct acl_term_suite suite; + struct acl_term terms[]; +}; + +#endif /* _HAPROXY_ACL_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/acl.h b/include/haproxy/acl.h new file mode 100644 index 0000000..38b1739 --- /dev/null +++ b/include/haproxy/acl.h @@ -0,0 +1,157 @@ +/* + * include/haproxy/acl.h + * This file provides interface definitions for ACL manipulation. + * + * Copyright (C) 2000-2013 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively.
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_ACL_H +#define _HAPROXY_ACL_H + +#include <haproxy/acl-t.h> +#include <haproxy/api.h> +#include <haproxy/arg-t.h> + +struct stream; + +/* + * FIXME: we need destructor functions too ! + */ + +/* Negate an acl result. This turns (ACL_TEST_FAIL, ACL_TEST_MISS, + * ACL_TEST_PASS) into (ACL_TEST_PASS, ACL_TEST_MISS, ACL_TEST_FAIL). + */ +static inline enum acl_test_res acl_neg(enum acl_test_res res) +{ + return (3 >> res); +} + +/* Convert an acl result to a boolean. Only ACL_TEST_PASS returns 1. */ +static inline int acl_pass(enum acl_test_res res) +{ + return (res >> 1); +} + +/* Return a pointer to the ACL <name> within the list starting at <head>, or + * NULL if not found. + */ +struct acl *find_acl_by_name(const char *name, struct list *head); + +/* Return a pointer to the ACL keyword <kw> within the registered keyword + * lists, or NULL if not found. Note that if <kw> contains an opening + * parenthesis, only the left part of it is checked. + */ +struct acl_keyword *find_acl_kw(const char *kw); + +/* Parse an ACL expression starting at <args>[0], and return it. + * Right now, the only accepted syntax is : + * <subject> [<value>...] + */ +struct acl_expr *parse_acl_expr(const char **args, char **err, struct arg_list *al, const char *file, int line); + +/* Purge everything in the acl <acl>, then return <acl>. */ +struct acl *prune_acl(struct acl *acl); + +/* Parse an ACL with the name starting at <args>[0], and with a list of already + * known ACLs in <acl>. If the ACL was not in the list, it will be added. + * A pointer to that ACL is returned. + * + * args syntax: <aclname> <acl_expr> + */ +struct acl *parse_acl(const char **args, struct list *known_acl, char **err, struct arg_list *al, const char *file, int line); + +/* Parse an ACL condition starting at <args>[0], relying on a list of already + * known ACLs passed in <known_acl>. The new condition is returned (or NULL in + * case of low memory). Supports multiple conditions separated by "or". + */ +struct acl_cond *parse_acl_cond(const char **args, struct list *known_acl, + enum acl_cond_pol pol, char **err, struct arg_list *al, + const char *file, int line); + +/* Builds an ACL condition starting at the if/unless keyword. The complete + * condition is returned. NULL is returned in case of error or if the first + * word is neither "if" nor "unless". It automatically sets the file name and + * the line number in the condition for better error reporting, and sets the + * HTTP initialization requirements in the proxy. If <err> is not NULL, it will + * be set to an error message upon errors, that the caller will have to free. + */ +struct acl_cond *build_acl_cond(const char *file, int line, struct list *known_acl, + struct proxy *px, const char **args, char **err); + +/* Execute condition <cond> and return either ACL_TEST_FAIL, ACL_TEST_MISS or + * ACL_TEST_PASS depending on the test results. ACL_TEST_MISS may only be + * returned if <opt> does not contain SMP_OPT_FINAL, indicating that incomplete + * data is being examined.
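The two helpers above combine naturally with the acl_cond_pol enum from acl-t.h. As a hypothetical illustration (the helper name is invented; the real evaluation entry point is acl_exec_cond(), declared just below), applying the IF/UNLESS polarity to a final result could look like:

```c
/* invented helper: returns non-zero if the condition is met once the
 * IF/UNLESS polarity is applied. Assumes <res> is final (not MISS).
 */
static inline int acl_cond_met(enum acl_test_res res, enum acl_cond_pol pol)
{
	int pass = acl_pass(res); /* 1 only for ACL_TEST_PASS */

	return (pol == ACL_COND_UNLESS) ? !pass : pass;
}
```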
The function automatically sets SMP_OPT_ITERATE. This + * function only computes the condition, it does not apply the polarity required + * by IF/UNLESS, it's up to the caller to do this. + */ +enum acl_test_res acl_exec_cond(struct acl_cond *cond, struct proxy *px, struct session *sess, struct stream *strm, unsigned int opt); + +/* Returns a pointer to the first ACL conflicting with usage at place <where> + * which is one of the SMP_VAL_* bits indicating a check place, or NULL if + * no conflict is found. Only full conflicts are detected (ACL is not usable). + * Use the next function to check for useless keywords. + */ +const struct acl *acl_cond_conflicts(const struct acl_cond *cond, unsigned int where); + +/* Returns a pointer to the first ACL and its first keyword to conflict with + * usage at place <where> which is one of the SMP_VAL_* bits indicating a check + * place. Returns true if a conflict is found, with <acl> and <kw> set (if non + * null), or false if no conflict is found. The first useless keyword is + * returned. + */ +int acl_cond_kw_conflicts(const struct acl_cond *cond, unsigned int where, struct acl const **acl, char const **kw); + +/* + * Find targets for userlist and groups in acl. Function returns the number + * of errors or OK if everything is fine. + */ +int acl_find_targets(struct proxy *p); + +/* + * Registers the ACL keyword list <kwl> as a list of valid keywords for next + * parsing sessions. + */ +void acl_register_keywords(struct acl_kw_list *kwl); + +/* + * Unregisters the ACL keyword list <kwl> from the list of valid keywords. + */ +void acl_unregister_keywords(struct acl_kw_list *kwl); + +/* initializes ACLs by resolving the sample fetch names they rely upon. + * Returns 0 on success, otherwise an error. + */ +int init_acl(void); + +void acl_dump_kwd(void); + +void free_acl_cond(struct acl_cond *cond); + +#endif /* _HAPROXY_ACL_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/action-t.h b/include/haproxy/action-t.h new file mode 100644 index 0000000..f77bdce --- /dev/null +++ b/include/haproxy/action-t.h @@ -0,0 +1,217 @@ +/* + * include/haproxy/action-t.h + * This file contains actions definitions. + * + * Copyright (C) 2000-2010 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_ACTION_T_H +#define _HAPROXY_ACTION_T_H + +#include <haproxy/applet-t.h> +#include <haproxy/stick_table-t.h> +#include <haproxy/vars-t.h> + +struct session; +struct stream; +struct proxy; + +enum act_from { + ACT_F_TCP_REQ_CON, /* tcp-request connection */ + ACT_F_TCP_REQ_SES, /* tcp-request session */ + ACT_F_TCP_REQ_CNT, /* tcp-request content */ + ACT_F_TCP_RES_CNT, /* tcp-response content */ + ACT_F_HTTP_REQ, /* http-request */ + ACT_F_HTTP_RES, /* http-response */ + ACT_F_TCP_CHK, /* tcp-check. */ + ACT_F_CFG_PARSER, /* config parser */ + ACT_F_CLI_PARSER, /* command line parser */ +}; + +enum act_return { + ACT_RET_CONT, /* continue processing. */ + ACT_RET_STOP, /* stop processing. */ + ACT_RET_YIELD, /* call me again. */ + ACT_RET_ERR, /* internal processing error. */ + ACT_RET_DONE, /* processing done, stop processing */ + ACT_RET_DENY, /* deny, must be handled by the caller */ + ACT_RET_ABRT, /* abort, handled by action itself. */ + ACT_RET_INV, /* invalid request/response */ +}; + +enum act_parse_ret { + ACT_RET_PRS_OK, /* continue processing. */ + ACT_RET_PRS_ERR, /* abort processing. */ +}; + +/* Option flags passed to custom actions */ +enum act_opt { + ACT_OPT_NONE = 0x00000000, /* no flag */ + ACT_OPT_FINAL = 0x00000001, /* last call, cannot yield */ + ACT_OPT_FIRST = 0x00000002, /* first call for this action */ +}; + +/* Flags used to describe the action. */ +enum act_flag { + ACT_FLAG_FINAL = 1 << 0, /* the action stops the rules evaluation when executed */ +}; + + +/* known actions to be used without any action function pointer. This enum is + * typically used in a switch case, if and only if .action_ptr is undefined. So + * if an action function is defined for one of the following action types, the + * function has priority over the switch. + */ +enum act_name { + ACT_CUSTOM = 0, + + /* common action */ + ACT_ACTION_ALLOW, + ACT_ACTION_DENY, + + /* common http actions */ + ACT_HTTP_REDIR, + + /* http request actions. */ + ACT_HTTP_REQ_TARPIT, + + /* tcp actions */ + ACT_TCP_EXPECT_PX, + ACT_TCP_EXPECT_CIP, + ACT_TCP_CLOSE, /* close at the sender's */ +}; + +/* Timeout name valid for a set-timeout rule */ +enum act_timeout_name { + ACT_TIMEOUT_SERVER, + ACT_TIMEOUT_TUNNEL, + ACT_TIMEOUT_CLIENT, +}; + +enum act_normalize_uri { + ACT_NORMALIZE_URI_PATH_MERGE_SLASHES, + ACT_NORMALIZE_URI_PATH_STRIP_DOT, + ACT_NORMALIZE_URI_PATH_STRIP_DOTDOT, + ACT_NORMALIZE_URI_PATH_STRIP_DOTDOT_FULL, + ACT_NORMALIZE_URI_QUERY_SORT_BY_NAME, + ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE, + ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE_STRICT, + ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED, + ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT, + ACT_NORMALIZE_URI_FRAGMENT_STRIP, + ACT_NORMALIZE_URI_FRAGMENT_ENCODE, +}; + +/* NOTE: if <.action_ptr> is defined, the referenced function will always be + * called regardless of the action type.
*/ +struct act_rule { + struct list list; + struct acl_cond *cond; /* acl condition to meet */ + unsigned int action; /* ACT_* or any meaningful value if action_ptr is defined */ + unsigned int flags; /* ACT_FLAG_* */ + enum act_from from; /* ACT_F_* */ + enum act_return (*action_ptr)(struct act_rule *rule, struct proxy *px, /* ptr to custom action */ + struct session *sess, struct stream *s, int opts); + int (*check_ptr)(struct act_rule *rule, struct proxy *px, char **err); /* ptr to check function */ + void (*release_ptr)(struct act_rule *rule); /* ptr to release function */ + const struct action_kw *kw; + struct applet applet; /* used for the applet registration. */ + union { + struct { + struct sample_expr *expr; + char *varname; + char *resolvers_id; + struct resolvers *resolvers; + struct resolv_options *opts; + } resolv; /* resolving */ + struct { + int i; /* integer param (status, nice, loglevel, ..) */ + struct ist str; /* string param (reason, header name, ...) */ + struct list fmt; /* log-format compatible expression */ + struct my_regex *re; /* used by replace-header/value/uri/path */ + } http; /* args used by some HTTP rules */ + struct http_reply *http_reply; /* HTTP response to be used by return/deny/tarpit rules */ + struct redirect_rule *redir; /* redirect rule or "http-request redirect" */ + struct { + char *ref; /* MAP or ACL file name to update */ + struct list key; /* pattern to retrieve MAP or ACL key */ + struct list value; /* pattern to retrieve MAP value */ + } map; + struct sample_expr *expr; + struct { + struct sample_expr *expr; /* expression used as the key */ + struct cap_hdr *hdr; /* the capture storage */ + } cap; + struct { + struct sample_expr *expr; + int idx; + } capid; + struct { + int value; /* plain timeout value in ms if no expr is used */ + enum act_timeout_name type; /* timeout type */ + struct sample_expr *expr; /* timeout value as an expression */ + } timeout; + struct hlua_rule *hlua_rule; + struct { + struct list fmt; /* log-format compatible expression */ + struct sample_expr *expr; + uint64_t name_hash; + enum vars_scope scope; + uint conditions; /* Bitfield of the conditions passed to this set-var call */ + } vars; + struct { + int sc; + unsigned int idx; + long long int value; + struct sample_expr *expr; + } gpc; + struct { + int sc; + unsigned int idx; + long long int value; + struct sample_expr *expr; + } gpt; + struct track_ctr_prm trk_ctr; + struct { + char *srvname; /* server name from config parsing. */ + struct server *srv; /* target server to attach the connection */ + struct sample_expr *name; /* used to differentiate idle connections */ + } attach_srv; /* 'attach-srv' rule */ + struct { + void *p[4]; + } act; /* generic pointers to be used by custom actions */ + } arg; /* arguments used by some actions */ + struct { + char *file; /* file name where the rule appears (or NULL) */ + int line; /* line number where the rule appears */ + } conf; +}; + +struct action_kw { + const char *kw; + enum act_parse_ret (*parse)(const char **args, int *cur_arg, struct proxy *px, + struct act_rule *rule, char **err); + int flags; + void *private; +}; + +struct action_kw_list { + struct list list; + struct action_kw kw[VAR_ARRAY]; +}; + +#endif /* _HAPROXY_ACTION_T_H */ diff --git a/include/haproxy/action.h b/include/haproxy/action.h new file mode 100644 index 0000000..dba1408 --- /dev/null +++ b/include/haproxy/action.h @@ -0,0 +1,124 @@ +/* + * include/haproxy/action.h + * This file contains actions prototypes. 
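To make the <action_ptr> contract concrete, here is a hedged sketch of a custom action conforming to the signature in struct act_rule above (the function name and its behavior are invented for illustration):

```c
/* illustrative custom action: does nothing and lets rule evaluation
 * continue. A real action could return ACT_RET_YIELD to be called
 * again, but only while ACT_OPT_FINAL is not set in <opts>.
 */
static enum act_return demo_action(struct act_rule *rule, struct proxy *px,
                                   struct session *sess, struct stream *s,
                                   int opts)
{
	if (opts & ACT_OPT_FIRST) {
		/* first call for this action: initialize per-call state here */
	}
	return ACT_RET_CONT; /* continue processing the rule list */
}
```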
+ * + * Copyright (C) 2000-2010 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_ACTION_H +#define _HAPROXY_ACTION_H + +#include <stdio.h> +#include <haproxy/action-t.h> +#include <haproxy/cfgparse.h> +#include <haproxy/list.h> +#include <haproxy/sample.h> + +struct resolv_requester; +struct dns_counters; + +int act_resolution_cb(struct resolv_requester *requester, struct dns_counters *counters); +int act_resolution_error_cb(struct resolv_requester *requester, int error_code); +const char *action_suggest(const char *word, const struct list *keywords, const char **extra); +void free_act_rule(struct act_rule *rule); + +static inline struct action_kw *action_lookup(struct list *keywords, const char *kw) +{ + struct action_kw_list *kw_list; + struct action_kw *best = NULL; + int len, bestlen = 0; + int i; + + if (LIST_ISEMPTY(keywords)) + return NULL; + + list_for_each_entry(kw_list, keywords, list) { + for (i = 0; kw_list->kw[i].kw != NULL; i++) { + if ((kw_list->kw[i].flags & KWF_MATCH_PREFIX) && + (len = strlen(kw_list->kw[i].kw)) > bestlen && + strncmp(kw, kw_list->kw[i].kw, len) == 0) { + if (len > bestlen) { + bestlen = len; + best = &kw_list->kw[i]; + } + } + if (strcmp(kw, kw_list->kw[i].kw) == 0) + return &kw_list->kw[i]; + } + } + return best; +} + +static inline void action_build_list(struct list *keywords, + struct buffer *chk) +{ + struct action_kw_list *kw_list; + int i; + char *p; + char *end; + int l; + + p = chk->area; + end = p + chk->size - 1; + list_for_each_entry(kw_list, keywords, list) { + for (i = 0; kw_list->kw[i].kw != NULL; i++) { + l = snprintf(p, end - p, "'%s%s', ", kw_list->kw[i].kw, (kw_list->kw[i].flags & KWF_MATCH_PREFIX) ? "(*)" : ""); + if (l > end - p) + continue; + p += l; + } + } + if (p > chk->area) + *(p-2) = '\0'; + else + *p = '\0'; +} + +/* Check an action ruleset validity. It returns the number of errors + * encountered, and err_code is updated if a warning is emitted. + */ +int check_action_rules(struct list *rules, struct proxy *px, int *err_code); + +/* Find and check the target table used by an action track-sc*. This + * function should be called during the configuration validity check. + * + * The function returns 1 on success, otherwise it returns 0 and err is + * filled. + */ +int check_trk_action(struct act_rule *rule, struct proxy *px, char **err); + +/* check a capture rule. This function should be called during the configuration + * validity check. + * + * The function returns 1 on success, otherwise it returns 0 and err is + * filled.
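A sketch of how such a keyword list is typically declared and then resolved with action_lookup() (the names are illustrative; registering the list into the appropriate per-hook keyword head is done elsewhere in the sources):

```c
static enum act_parse_ret parse_demo(const char **args, int *cur_arg,
                                     struct proxy *px, struct act_rule *rule,
                                     char **err)
{
	rule->action = ACT_CUSTOM;
	/* a real parser would also set rule->action_ptr and consume args */
	return ACT_RET_PRS_OK;
}

/* NULL-terminated keyword array embedded in a linkable list, as
 * described by struct action_kw_list in action-t.h.
 */
static struct action_kw_list demo_kws = {
	.kw = {
		{ "demo-action", parse_demo, 0, NULL },
		{ NULL, NULL, 0, NULL }
	}
};

/* after linking demo_kws.list into a keyword head <rule_kws>:
 * struct action_kw *kw = action_lookup(&rule_kws, "demo-action");
 */
```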
+ */ +int check_capture(struct act_rule *rule, struct proxy *px, char **err); + +int cfg_parse_rule_set_timeout(const char **args, int idx, struct act_rule *rule, + struct proxy *px, char **err); + +static inline void release_timeout_action(struct act_rule *rule) +{ + release_sample_expr(rule->arg.timeout.expr); +} + +struct act_rule *new_act_rule(enum act_from from, const char *file, int linenum); +void free_act_rules(struct list *rules); +void dump_act_rules(const struct list *rules, const char *pfx); + +#endif /* _HAPROXY_ACTION_H */ diff --git a/include/haproxy/activity-t.h b/include/haproxy/activity-t.h new file mode 100644 index 0000000..9faeecd --- /dev/null +++ b/include/haproxy/activity-t.h @@ -0,0 +1,144 @@ +/* + * include/haproxy/activity-t.h + * This file contains structure declarations for activity measurements. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_ACTIVITY_T_H +#define _HAPROXY_ACTIVITY_T_H + +#include <haproxy/api-t.h> +#include <haproxy/freq_ctr-t.h> + +/* bit fields for the "profiling" global variable */ +#define HA_PROF_TASKS_OFF 0x00000000 /* per-task CPU profiling forced disabled */ +#define HA_PROF_TASKS_AOFF 0x00000001 /* per-task CPU profiling off (automatic) */ +#define HA_PROF_TASKS_AON 0x00000002 /* per-task CPU profiling on (automatic) */ +#define HA_PROF_TASKS_ON 0x00000003 /* per-task CPU profiling forced enabled */ +#define HA_PROF_TASKS_MASK 0x00000003 /* per-task CPU profiling mask */ + +#define HA_PROF_MEMORY 0x00000004 /* memory profiling */ + + +#ifdef USE_MEMORY_PROFILING +/* Elements used by memory profiling. This determines the number of buckets to + * store stats. + */ +#define MEMPROF_HASH_BITS 10 +#define MEMPROF_HASH_BUCKETS (1U << MEMPROF_HASH_BITS) + +enum memprof_method { + MEMPROF_METH_UNKNOWN = 0, + MEMPROF_METH_MALLOC, + MEMPROF_METH_CALLOC, + MEMPROF_METH_REALLOC, + MEMPROF_METH_FREE, + MEMPROF_METH_P_ALLOC, // pool_alloc() + MEMPROF_METH_P_FREE, // pool_free() + MEMPROF_METH_METHODS /* count, must be last */ +}; + +/* stats: + * - malloc increases alloc + * - free increases free (if non null) + * - realloc increases either depending on the size change. + * when the real size is known (malloc_usable_size()), it's used in free_tot + * and alloc_tot, otherwise the requested size is reported in alloc_tot and + * zero in free_tot. + */ +struct memprof_stats { + const void *caller; + enum memprof_method method; + /* 4-7 bytes hole here */ + unsigned long long alloc_calls; + unsigned long long free_calls; + unsigned long long alloc_tot; + unsigned long long free_tot; + void *info; // for pools, ptr to the pool + void *pad; // pad to 64 +}; +#endif + +/* per-thread activity reports. It's important that it's aligned on cache lines + * because some elements will be updated very often. 
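To illustrate the realloc accounting rule stated in the memprof comment above, a hypothetical sketch (the function name is invented) of how one bucket would be updated when both sizes are known:

```c
#include <stddef.h>

#ifdef USE_MEMORY_PROFILING
/* hypothetical sketch: account a realloc(old_size -> new_size) into
 * <bin>, per the rule above: the real freed size feeds free_tot, the
 * real allocated size feeds alloc_tot; pass old_size=0 when unknown.
 */
static void demo_account_realloc(struct memprof_stats *bin,
                                 size_t old_size, size_t new_size)
{
	bin->alloc_calls++;
	bin->alloc_tot += new_size;
	bin->free_tot  += old_size;
}
#endif
```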
Most counters are OK on + * 32-bit since this will be used during debugging sessions for troubleshooting + * in iterative mode. + */ +struct activity { + unsigned int loops; // complete loops in run_poll_loop() + unsigned int wake_tasks; // active tasks prevented poll() from sleeping + unsigned int wake_signal; // pending signal prevented poll() from sleeping + unsigned int poll_io; // number of times poll() reported I/O events + unsigned int poll_exp; // number of times poll() sees an expired timeout (includes wake_*) + unsigned int poll_drop_fd; // poller dropped a dead FD from the update list + unsigned int poll_skip_fd; // poller skipped another thread's FD + unsigned int conn_dead; // conn_fd_handler woke up on an FD indicating a dead connection + unsigned int stream_calls; // calls to process_stream() + unsigned int ctxsw; // total number of context switches + unsigned int tasksw; // total number of task switches + unsigned int empty_rq; // calls to process_runnable_tasks() with nothing for the thread + unsigned int long_rq; // process_runnable_tasks() left with tasks in the run queue + unsigned int cpust_total; // sum of half-ms stolen per thread + unsigned int fd_takeover; // number of times this thread stole another one's FD + unsigned int check_adopted;// number of times a check was migrated to this thread + ALWAYS_ALIGN(64); + + struct freq_ctr cpust_1s; // avg amount of half-ms stolen over last second + struct freq_ctr cpust_15s; // avg amount of half-ms stolen over last 15s + unsigned int avg_loop_us; // average run time per loop over last 1024 runs + unsigned int accepted; // accepted incoming connections + unsigned int accq_pushed; // accept queue connections pushed + unsigned int accq_full; // accept queue connection not pushed because full + unsigned int pool_fail; // failed a pool allocation + unsigned int buf_wait; // waited on a buffer allocation + unsigned int check_started;// number of times a check was started on this thread +#if defined(DEBUG_DEV) + /* keep these ones at the end */ + unsigned int ctr0; // general purpose debug counter + unsigned int ctr1; // general purpose debug counter + unsigned int ctr2; // general purpose debug counter +#endif + char __pad[0]; // unused except to check remaining room + char __end[0] __attribute__((aligned(64))); // align size to 64. +}; + +/* 256 entries for callers * callees should be highly sufficient (~45 seen usually) */ +#define SCHED_ACT_HASH_BITS 8 +#define SCHED_ACT_HASH_BUCKETS (1U << SCHED_ACT_HASH_BITS) + +/* global profiling stats from the scheduler: each entry corresponds to a + * task or tasklet ->process function pointer, with a number of calls and + * a total time. Each entry is unique, except entry 0 which is for colliding + * hashes (i.e. others). All of these must be accessed atomically. + */ +struct sched_activity { + const void *func; + const struct ha_caller *caller; + uint64_t calls; + uint64_t cpu_time; + uint64_t lat_time; +}; + +#endif /* _HAPROXY_ACTIVITY_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/activity.h b/include/haproxy/activity.h new file mode 100644 index 0000000..dbc8ec3 --- /dev/null +++ b/include/haproxy/activity.h @@ -0,0 +1,47 @@ +/* + * include/haproxy/activity.h + * This file contains macros and inline functions for activity measurements.
+ * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_ACTIVITY_H +#define _HAPROXY_ACTIVITY_H + +#include <haproxy/activity-t.h> +#include <haproxy/api.h> + +extern unsigned int profiling; +extern struct activity activity[MAX_THREADS]; +extern struct sched_activity sched_activity[SCHED_ACT_HASH_BUCKETS]; + +void report_stolen_time(uint64_t stolen); +void activity_count_runtime(uint32_t run_time); +struct sched_activity *sched_activity_entry(struct sched_activity *array, const void *func, const void *caller); + +#ifdef USE_MEMORY_PROFILING +struct memprof_stats *memprof_get_bin(const void *ra, enum memprof_method meth); +#endif + +#endif /* _HAPROXY_ACTIVITY_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/api-t.h b/include/haproxy/api-t.h new file mode 100644 index 0000000..edb33a8 --- /dev/null +++ b/include/haproxy/api-t.h @@ -0,0 +1,40 @@ +/* + * include/haproxy/api-t.h + * This provides definitions for all common types or type modifiers used + * everywhere in the code, and suitable for use in structure fields. + * + * Copyright (C) 2020 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _HAPROXY_TYPES_H +#define _HAPROXY_TYPES_H + +#include <inttypes.h> +#include <stddef.h> + +#include <haproxy/compat.h> +#include <haproxy/compiler.h> +#include <haproxy/defaults.h> +#include <haproxy/list-t.h> + +#endif /* _HAPROXY_TYPES_H */ diff --git a/include/haproxy/api.h b/include/haproxy/api.h new file mode 100644 index 0000000..a0bb6a8 --- /dev/null +++ b/include/haproxy/api.h @@ -0,0 +1,38 @@ +/* + * include/haproxy/api.h + * + * Include wrapper that assembles all includes required by every haproxy file. 
+ * Please do not add direct definitions into this file. + * + * Copyright (C) 2020 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _HAPROXY_BASE_H +#define _HAPROXY_BASE_H + +#include <haproxy/api-t.h> +#include <haproxy/atomic.h> +#include <haproxy/bug.h> +#include <haproxy/init.h> + +#endif diff --git a/include/haproxy/applet-t.h b/include/haproxy/applet-t.h new file mode 100644 index 0000000..bd96403 --- /dev/null +++ b/include/haproxy/applet-t.h @@ -0,0 +1,101 @@ +/* + * include/haproxy/applet-t.h + * This file describes the applet struct and associated constants. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_APPLET_T_H +#define _HAPROXY_APPLET_T_H + +#include <haproxy/api-t.h> +#include <haproxy/buf-t.h> +#include <haproxy/dynbuf-t.h> +#include <haproxy/freq_ctr-t.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/xref-t.h> + +/* flags for appctx->state */ +#define APPLET_WANT_DIE 0x01 /* applet was running and requested to die */ + +/* Room for per-command context (mostly CLI commands but not only) */ +#define APPLET_MAX_SVCCTX 88 + +struct appctx; +struct proxy; +struct stconn; +struct sedesc; +struct session; + +/* Applet descriptor */ +struct applet { + enum obj_type obj_type; /* object type = OBJ_TYPE_APPLET */ + /* 3 unused bytes here */ + char *name; /* applet's name to report in logs */ + int (*init)(struct appctx *); /* callback to init resources, may be NULL. + expect 0 if ok, -1 if an error occurs. */ + void (*fct)(struct appctx *); /* internal I/O handler, may never be NULL */ + void (*release)(struct appctx *); /* callback to release resources, may be NULL */ + unsigned int timeout; /* execution timeout. */ +}; + +/* Context of a running applet. 
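For reference, a minimal applet descriptor filled in according to the structure above might look like this (a sketch; the handler is a placeholder and OBJ_TYPE_APPLET comes from obj_type-t.h):

```c
static void demo_applet_io(struct appctx *appctx)
{
	/* internal I/O handler: exchanges data with the stream here */
}

static struct applet demo_applet = {
	.obj_type = OBJ_TYPE_APPLET,
	.name     = "<DEMO>",       /* reported in logs */
	.init     = NULL,           /* no resources to set up */
	.fct      = demo_applet_io, /* may never be NULL */
	.release  = NULL,           /* no resources to free */
};
```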
*/ +struct appctx { + enum obj_type obj_type; /* OBJ_TYPE_APPCTX */ + /* 3 unused bytes here */ + unsigned short state; /* Internal appctx state */ + unsigned int st0; /* CLI state for stats, session state for peers */ + unsigned int st1; /* prompt/payload (bitwise OR of APPCTX_CLI_ST1_*) for stats, session error for peers */ + struct buffer *chunk; /* used to store unfinished commands */ + struct applet *applet; /* applet this context refers to */ + struct session *sess; /* session for frontend applets (NULL for backend applets) */ + struct sedesc *sedesc; /* stream endpoint descriptor the applet is attached to */ + struct act_rule *rule; /* rule associated with the applet. */ + int (*io_handler)(struct appctx *appctx); /* used within the cli_io_handler when st0 = CLI_ST_CALLBACK */ + void (*io_release)(struct appctx *appctx); /* used within the cli_io_handler when st0 = CLI_ST_CALLBACK, + if the command is terminated or the session released */ + int cli_severity_output; /* used within the cli_io_handler to format severity output of informational feedback */ + int cli_level; /* the level of CLI which can be lowered dynamically */ + char cli_payload_pat[8]; /* Payload pattern */ + uint32_t cli_anon_key; /* the key to anonymise with the hash in cli */ + struct buffer_wait buffer_wait; /* position in the list of objects waiting for a buffer */ + struct task *t; /* task associated to the applet */ + struct freq_ctr call_rate; /* appctx call rate */ + struct list wait_entry; /* entry in a list of waiters for an event (e.g. ring events) */ + + /* The pointer seen by application code is appctx->svcctx. In 2.7 the + * anonymous union and the "ctx" struct disappeared, and the struct + * "svc" became svc_storage, which is never accessed directly by + * application code. Look at "show fd" for an example. + */ + + /* here we have the service's context (CLI command, applet, etc) */ + void *svcctx; /* pointer to a context used by the command, e.g. <storage> below */ + struct { + void *shadow; /* shadow of svcctx above, do not use! */ + char storage[APPLET_MAX_SVCCTX]; /* storage of svcctx above */ + } svc; /* generic storage for most commands */ +}; + +#endif /* _HAPROXY_APPLET_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/applet.h b/include/haproxy/applet.h new file mode 100644 index 0000000..b04ffd9 --- /dev/null +++ b/include/haproxy/applet.h @@ -0,0 +1,270 @@ +/* + * include/haproxy/applet.h + * This file contains applet function prototypes + * + * Copyright (C) 2000-2015 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_APPLET_H +#define _HAPROXY_APPLET_H + +#include <stdlib.h> + +#include <haproxy/api.h> +#include <haproxy/applet-t.h> +#include <haproxy/channel.h> +#include <haproxy/list.h> +#include <haproxy/pool.h> +#include <haproxy/sc_strm.h> +#include <haproxy/session.h> +#include <haproxy/stconn.h> +#include <haproxy/task.h> + +extern unsigned int nb_applets; +extern struct pool_head *pool_head_appctx; + +struct task *task_run_applet(struct task *t, void *context, unsigned int state); +int appctx_buf_available(void *arg); +void *applet_reserve_svcctx(struct appctx *appctx, size_t size); +void applet_reset_svcctx(struct appctx *appctx); +void appctx_shut(struct appctx *appctx); + +struct appctx *appctx_new_on(struct applet *applet, struct sedesc *sedesc, int thr); +int appctx_finalize_startup(struct appctx *appctx, struct proxy *px, struct buffer *input); +void appctx_free_on_early_error(struct appctx *appctx); +void appctx_free(struct appctx *appctx); + +static inline struct appctx *appctx_new_here(struct applet *applet, struct sedesc *sedesc) +{ + return appctx_new_on(applet, sedesc, tid); +} + +static inline struct appctx *appctx_new_anywhere(struct applet *applet, struct sedesc *sedesc) +{ + return appctx_new_on(applet, sedesc, -1); +} + +/* Helper function to call .init applet callback function, if it exists. Returns 0 + * on success and -1 on error. + */ +static inline int appctx_init(struct appctx *appctx) +{ + /* Set appctx affinity to the current thread. Because, after this call, + * the appctx will be fully initialized. The session and the stream will + * eventually be created. The affinity must be set now ! + */ + BUG_ON(appctx->t->tid != tid); + task_set_thread(appctx->t, tid); + + if (appctx->applet->init) + return appctx->applet->init(appctx); + return 0; +} + +/* Releases an appctx previously allocated by appctx_new(). */ +static inline void __appctx_free(struct appctx *appctx) +{ + task_destroy(appctx->t); + if (LIST_INLIST(&appctx->buffer_wait.list)) + LIST_DEL_INIT(&appctx->buffer_wait.list); + if (appctx->sess) + session_free(appctx->sess); + BUG_ON(appctx->sedesc && !se_fl_test(appctx->sedesc, SE_FL_ORPHAN)); + sedesc_free(appctx->sedesc); + pool_free(pool_head_appctx, appctx); + _HA_ATOMIC_DEC(&nb_applets); +} + +/* wakes up an applet when conditions have changed. We're using a macro here in + * order to retrieve the caller's place. + */ +#define appctx_wakeup(ctx) \ + _task_wakeup((ctx)->t, TASK_WOKEN_OTHER, MK_CALLER(WAKEUP_TYPE_APPCTX_WAKEUP, 0, 0)) + +/* returns the stream connector the appctx is attached to, via the sedesc */ +static inline struct stconn *appctx_sc(const struct appctx *appctx) +{ + return appctx->sedesc->sc; +} + +/* returns the stream the appctx is attached to. Note that a stream *must* + * be attached, as we use an unchecked dereference via __sc_strm(). + */ +static inline struct stream *appctx_strm(const struct appctx *appctx) +{ + return __sc_strm(appctx->sedesc->sc); +} + +/* The applet announces it has more data to deliver to the stream's input + * buffer. + */ +static inline void applet_have_more_data(struct appctx *appctx) +{ + se_fl_clr(appctx->sedesc, SE_FL_HAVE_NO_DATA); +} + +/* The applet announces it doesn't have more data for the stream's input + * buffer. 
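applet_reserve_svcctx(), declared above, is how a command obtains the per-command storage described in applet-t.h. A sketch of typical use at the top of an I/O handler (the context structure is invented):

```c
struct demo_ctx {
	int state;          /* invented per-command state */
	unsigned int lines; /* invented progress counter */
};

static int demo_io_handler(struct appctx *appctx)
{
	/* reserve room in the appctx; the size must fit within
	 * APPLET_MAX_SVCCTX, and the pointer stays valid across calls.
	 */
	struct demo_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));

	ctx->lines++;
	return 1; /* non-zero here means processing is complete */
}
```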
+ */ +static inline void applet_have_no_more_data(struct appctx *appctx) +{ + se_fl_set(appctx->sedesc, SE_FL_HAVE_NO_DATA); +} + +/* The applet indicates that it's ready to consume data from the stream's + * output buffer. Rely on the corresponding SE function + */ +static inline void applet_will_consume(struct appctx *appctx) +{ + se_will_consume(appctx->sedesc); +} + +/* The applet indicates that it's not willing to consume data from the stream's + * output buffer. Rely on the corresponding SE function + */ +static inline void applet_wont_consume(struct appctx *appctx) +{ + se_wont_consume(appctx->sedesc); +} + +/* The applet indicates that it's willing to consume data from the stream's + * output buffer, but that there's not enough, so it doesn't want to be woken + * up until more are presented. Rely on the corresponding SE function + */ +static inline void applet_need_more_data(struct appctx *appctx) +{ + se_need_more_data(appctx->sedesc); +} + +/* The applet indicates that it does not expect data from the opposite endpoint. + * This way the stream knows it should not trigger read timeout on the other + * side. + */ +static inline void applet_expect_no_data(struct appctx *appctx) +{ + se_fl_set(appctx->sedesc, SE_FL_EXP_NO_DATA); +} + +/* The applet indicates that it expects data from the opposite endpoint. This + * way the stream knows it may trigger read timeout on the other side. + */ +static inline void applet_expect_data(struct appctx *appctx) +{ + se_fl_clr(appctx->sedesc, SE_FL_EXP_NO_DATA); +} + +/* writes chunk <chunk> into the input channel of the stream attached to this + * appctx's endpoint, and marks the SC_FL_NEED_ROOM on a channel full error. + * See ci_putchk() for the list of return codes. + */ +static inline int applet_putchk(struct appctx *appctx, struct buffer *chunk) +{ + struct sedesc *se = appctx->sedesc; + int ret; + + ret = ci_putchk(sc_ic(se->sc), chunk); + if (ret < 0) { + /* XXX: Handle all errors as a lack of space because callers + * don't handle other cases for now. So applets must be + * careful to handle shutdown (-2) and invalid calls (-3) by + * themselves. + */ + sc_need_room(se->sc, chunk->data); + ret = -1; + } + + return ret; +} + +/* writes <len> chars from <blk> into the input channel of the stream attached + * to this appctx's endpoint, and marks the SC_FL_NEED_ROOM on a channel full + * error. See ci_putblk() for the list of return codes. + */ +static inline int applet_putblk(struct appctx *appctx, const char *blk, int len) +{ + struct sedesc *se = appctx->sedesc; + int ret; + + ret = ci_putblk(sc_ic(se->sc), blk, len); + if (ret < -1) { + /* XXX: Handle all errors as a lack of space because callers + * don't handle other cases for now. So applets must be + * careful to handle shutdown (-2) and invalid calls (-3) by + * themselves. + */ + sc_need_room(se->sc, len); + ret = -1; + } + + return ret; +} + +/* writes chars from <str> up to the trailing zero (excluded) into the input + * channel of the stream attached to this appctx's endpoint, and marks the + * SC_FL_NEED_ROOM on a channel full error. See ci_putstr() for the list of + * return codes. + */ +static inline int applet_putstr(struct appctx *appctx, const char *str) +{ + struct sedesc *se = appctx->sedesc; + int ret; + + ret = ci_putstr(sc_ic(se->sc), str); + if (ret == -1) { + /* XXX: Handle all errors as a lack of space because callers + * don't handle other cases for now. So applets must be + * careful to handle shutdown (-2) and invalid calls (-3) by + * themselves.
+ */ + sc_need_room(se->sc, strlen(str)); + ret = -1; + } + + return ret; +} + +/* writes character <chr> into the input channel of the stream attached to this + * appctx's endpoint, and marks the SC_FL_NEED_ROOM on a channel full error. + * See ci_putchr() for the list of return codes. + */ +static inline int applet_putchr(struct appctx *appctx, char chr) +{ + struct sedesc *se = appctx->sedesc; + int ret; + + ret = ci_putchr(sc_ic(se->sc), chr); + if (ret == -1) { + /* XXX: Handle all errors as a lack of space because callers + * don't handle other cases for now. So applets must be + * careful to handle shutdown (-2) and invalid calls (-3) by + * themselves. + */ + sc_need_room(se->sc, 1); + ret = -1; + } + + return ret; +} + +#endif /* _HAPROXY_APPLET_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/arg-t.h b/include/haproxy/arg-t.h new file mode 100644 index 0000000..d90d326 --- --- /dev/null +++ b/include/haproxy/arg-t.h @@ -0,0 +1,152 @@ +/* + * include/haproxy/arg-t.h + * This file contains structure declarations for generic argument parsing. + * + * Copyright 2012 Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_ARG_T_H +#define _HAPROXY_ARG_T_H + +#include <sys/socket.h> +#include <netinet/in.h> + +#include <haproxy/buf-t.h> +#include <haproxy/protobuf-t.h> +#include <haproxy/stick_table-t.h> +#include <haproxy/vars-t.h> + +/* encoding of each arg type : up to 31 types are supported */ +#define ARGT_BITS 5 +#define ARGT_NBTYPES (1 << ARGT_BITS) +#define ARGT_MASK (ARGT_NBTYPES - 1) + +/* encoding of the arg count : up to 12 args are possible. 4 bits are left + * unused at the top. + */ +#define ARGM_MASK ((1 << ARGM_BITS) - 1) +#define ARGM_BITS 4 +#define ARGM_NBARGS (sizeof(uint64_t) * 8 - ARGM_BITS) / ARGT_BITS + +enum { + ARGT_STOP = 0, /* end of the arg list */ + ARGT_SINT, /* signed 64 bit integer. */ + ARGT_STR, /* string */ + ARGT_IPV4, /* an IPv4 address */ + ARGT_MSK4, /* an IPv4 address mask (integer or dotted), stored as ARGT_IPV4 */ + ARGT_IPV6, /* an IPv6 address */ + ARGT_MSK6, /* an IPv6 address mask (integer or dotted), stored as ARGT_IPV6 */ + ARGT_TIME, /* a delay in ms by default, stored as ARGT_UINT */ + ARGT_SIZE, /* a size in bytes by default, stored as ARGT_UINT */ + ARGT_FE, /* a pointer to a frontend only */ + ARGT_BE, /* a pointer to a backend only */ + ARGT_TAB, /* a pointer to a stick table */ + ARGT_SRV, /* a pointer to a server */ + ARGT_USR, /* a pointer to a user list */ + ARGT_MAP, /* a pointer to a map descriptor */ + ARGT_REG, /* a pointer to a regex */ + ARGT_VAR, /* contains a variable description.
*/ + ARGT_PBUF_FNUM, /* a protocol buffer field number */ + ARGT_PTR, /* a pointer to opaque data */ + /* please update arg_type_names[] in args.c if you add entries here */ +}; + +/* context where arguments are used, in order to help error reporting */ +enum { + ARGC_ACL = 0, /* ACL */ + ARGC_STK, /* sticking rule */ + ARGC_TRK, /* tracking rule */ + ARGC_LOG, /* log-format */ + ARGC_LOGSD, /* log-format-sd */ + ARGC_HRQ, /* http-request */ + ARGC_HRS, /* http-response */ + ARGC_UIF, /* unique-id-format */ + ARGC_RDR, /* redirect */ + ARGC_CAP, /* capture rule */ + ARGC_SRV, /* server line */ + ARGC_SPOE, /* spoe message args */ + ARGC_UBK, /* use_backend message */ + ARGC_USRV, /* use-server message */ + ARGC_HERR, /* http-error */ + ARGC_OT, /* opentracing scope args */ + ARGC_OPT, /* option directive */ + ARGC_TCO, /* tcp-request connection expression */ + ARGC_TSE, /* tcp-request session expression */ + ARGC_TRQ, /* tcp-request content expression */ + ARGC_TRS, /* tcp-response content expression */ + ARGC_TCK, /* tcp-check expression */ + ARGC_CFG, /* configuration expression */ + ARGC_CLI, /* CLI expression*/ +}; + +/* flags used when compiling and executing regex */ +#define ARGF_REG_ICASE 1 +#define ARGF_REG_GLOB 2 + +/* some types that are externally defined */ +struct proxy; +struct server; +struct userlist; +struct my_regex; + +union arg_data { + long long int sint; + struct buffer str; + struct in_addr ipv4; + struct in6_addr ipv6; + struct proxy *prx; /* used for fe, be, tables */ + struct server *srv; + struct stktable *t; + struct userlist *usr; + struct map_descriptor *map; + struct my_regex *reg; + struct pbuf_fid fid; + struct var_desc var; + void *ptr; +}; + +struct arg { + unsigned char type; /* argument type, ARGT_* */ + unsigned char unresolved; /* argument contains a string in <str> that must be resolved and freed */ + unsigned char type_flags; /* type-specific extra flags (eg: case sensitivity for regex), ARGF_* */ + union arg_data data; /* argument data */ +}; + +/* arg lists are used to store information about arguments that could not be + * resolved when parsing the configuration. The head is an arg_list which + * serves as a template to create new entries. Nothing here is allocated, + * so plain copies are OK. + */ +struct arg_list { + struct list list; /* chaining with other arg_list, or list head */ + struct arg *arg; /* pointer to the arg, NULL on list head */ + int arg_pos; /* argument position */ + int ctx; /* context where the arg is used (ARGC_*) */ + const char *kw; /* keyword making use of these args */ + const char *conv; /* conv keyword when in conv, otherwise NULL */ + const char *file; /* file name where the args are referenced */ + int line; /* line number where the args are referenced */ +}; + +#endif /* _HAPROXY_ARG_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/arg.h b/include/haproxy/arg.h new file mode 100644 index 0000000..5fe1888 --- /dev/null +++ b/include/haproxy/arg.h @@ -0,0 +1,94 @@ +/* + * include/haproxy/arg.h + * This file contains functions and macros declarations for generic argument parsing. + * + * Copyright 2012 Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_ARG_H +#define _HAPROXY_ARG_H + +#include <haproxy/arg-t.h> + +/* Some macros used to build some arg list. We can declare various argument + * combinations from 0 to 12 args using a single 64-bit integer. The first + * argument of these macros is always the mandatory number of arguments, and + * remaining ones are optional args. Note: ARGM() may also be used to return + * the number of mandatory arguments in a mask. + */ +#define ARGM(m) \ + (uint64_t)(m & ARGM_MASK) + +#define ARG1(m, t1) \ + (ARGM(m) + ((uint64_t)ARGT_##t1 << (ARGM_BITS))) + +#define ARG2(m, t1, t2) \ + (ARG1(m, t1) + ((uint64_t)ARGT_##t2 << (ARGM_BITS + ARGT_BITS))) + +#define ARG3(m, t1, t2, t3) \ + (ARG2(m, t1, t2) + ((uint64_t)ARGT_##t3 << (ARGM_BITS + ARGT_BITS * 2))) + +#define ARG4(m, t1, t2, t3, t4) \ + (ARG3(m, t1, t2, t3) + ((uint64_t)ARGT_##t4 << (ARGM_BITS + ARGT_BITS * 3))) + +#define ARG5(m, t1, t2, t3, t4, t5) \ + (ARG4(m, t1, t2, t3, t4) + ((uint64_t)ARGT_##t5 << (ARGM_BITS + ARGT_BITS * 4))) + +#define ARG6(m, t1, t2, t3, t4, t5, t6) \ + (ARG5(m, t1, t2, t3, t4, t5) + ((uint64_t)ARGT_##t6 << (ARGM_BITS + ARGT_BITS * 5))) + +#define ARG7(m, t1, t2, t3, t4, t5, t6, t7) \ + (ARG6(m, t1, t2, t3, t4, t5, t6) + ((uint64_t)ARGT_##t7 << (ARGM_BITS + ARGT_BITS * 6))) + +#define ARG8(m, t1, t2, t3, t4, t5, t6, t7, t8) \ + (ARG7(m, t1, t2, t3, t4, t5, t6, t7) + ((uint64_t)ARGT_##t8 << (ARGM_BITS + ARGT_BITS * 7))) + +#define ARG9(m, t1, t2, t3, t4, t5, t6, t7, t8, t9) \ + (ARG8(m, t1, t2, t3, t4, t5, t6, t7, t8) + ((uint64_t)ARGT_##t9 << (ARGM_BITS + ARGT_BITS * 8))) + +#define ARG10(m, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) \ + (ARG9(m, t1, t2, t3, t4, t5, t6, t7, t8, t9) + ((uint64_t)ARGT_##t10 << (ARGM_BITS + ARGT_BITS * 9))) + +#define ARG11(m, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \ + (ARG10(m, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) + ((uint64_t)ARGT_##t11 << (ARGM_BITS + ARGT_BITS * 10))) + +#define ARG12(m, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12) \ + (ARG11(m, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) + ((uint64_t)ARGT_##t12 << (ARGM_BITS + ARGT_BITS * 11))) + +/* Mapping between argument number and literal description. */ +extern const char *arg_type_names[]; + +/* This dummy arg list may be used by default when no arg is found, it helps + * parsers by removing pointer checks.
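Decoding the resulting 64-bit mask is simple arithmetic; a small sketch using the ARGT/ARGM constants from arg-t.h (the helper name is invented):

```c
#include <stdint.h>

/* invented helper: extract the type of argument <i> (0-based) from a
 * mask built with the ARGn() macros above.
 */
static inline unsigned int arg_type_at(uint64_t mask, int i)
{
	return (mask >> (ARGM_BITS + ARGT_BITS * i)) & ARGT_MASK;
}

/* example: ARG2(1, STR, SINT) declares one mandatory string plus an
 * optional integer, so ARGM(mask) == 1, arg_type_at(mask, 0) == ARGT_STR
 * and arg_type_at(mask, 1) == ARGT_SINT.
 */
```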
+ */ +extern struct arg empty_arg_list[ARGM_NBARGS]; + +struct arg_list *arg_list_clone(const struct arg_list *orig); +struct arg_list *arg_list_add(struct arg_list *orig, struct arg *arg, int pos); +int make_arg_list(const char *in, int len, uint64_t mask, struct arg **argp, + char **err_msg, const char **end_ptr, int *err_arg, + struct arg_list *al); +struct arg *free_args(struct arg *args); + +#endif /* _HAPROXY_ARG_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/atomic.h b/include/haproxy/atomic.h new file mode 100644 index 0000000..d64e192 --- /dev/null +++ b/include/haproxy/atomic.h @@ -0,0 +1,897 @@ +/* + * include/haproxy/atomic.h + * Macros and inline functions for thread-safe atomic operations. + * + * Copyright (C) 2017 Christopher Faulet - cfaulet@haproxy.com + * Copyright (C) 2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_ATOMIC_H +#define _HAPROXY_ATOMIC_H + +#include <haproxy/compiler.h> + +/* A few notes for the macros and functions here: + * - this file is painful to edit, most operations exist in 3 variants, + * no-thread, threads with gcc<4.7, threads with gcc>=4.7. Be careful when + * modifying it not to break any of them. + * + * - macros named HA_ATOMIC_* are for use in the general case, they contain the + * required memory barriers to guarantee sequential consistency + * + * - macros named _HA_ATOMIC_* are the same but without the memory barriers, + * so they may only be used if followed by other HA_ATOMIC_* or within a + * sequence of _HA_ATOMIC_* terminated by a store barrier, or when there is + * no data dependency (e.g. updating a counter). Not all of them are + * implemented, in which case fallbacks to the safe ones are provided. In + * case of doubt, don't use them and use the generic ones instead. + * + * - the __ha_atomic_* barriers are for use around _HA_ATOMIC_* operations. + * Some architectures make them useless and they will automatically be + * dropped in such a case. Don't use them outside of this use case. + * + * - in general, the more underscores you find in front of a function or macro + * name, the riskier it is to use. Barriers are among them because validating + * their usage is not trivial at all and it's often safer to fall back to + * more generic behaviors. + * + * There is also a compiler barrier (__ha_compiler_barrier) which is eliminated + * when threads are disabled. We currently don't have a permanent compiler + * barrier to prevent the compiler from reordering signal-sensitive code for + * example. + */ + + +#ifndef USE_THREAD + +/* Threads are DISABLED, atomic ops are also not used. Note that these MUST + * NOT be used for inter-process synchronization nor signal-safe variable + * manipulations which might occur without threads, as they are not atomic.
+ */ + +#define HA_ATOMIC_LOAD(val) *(val) +#define HA_ATOMIC_STORE(val, new) ({*(val) = new;}) + +#define HA_ATOMIC_XCHG(val, new) \ + ({ \ + typeof(*(val)) __old_xchg = *(val); \ + *(val) = new; \ + __old_xchg; \ + }) + +#define HA_ATOMIC_AND(val, flags) do { *(val) &= (flags);} while (0) +#define HA_ATOMIC_OR(val, flags) do { *(val) |= (flags);} while (0) +#define HA_ATOMIC_ADD(val, i) do { *(val) += (i);} while (0) +#define HA_ATOMIC_SUB(val, i) do { *(val) -= (i);} while (0) +#define HA_ATOMIC_INC(val) do { *(val) += 1;} while (0) +#define HA_ATOMIC_DEC(val) do { *(val) -= 1;} while (0) + +#define HA_ATOMIC_AND_FETCH(val, flags) ({ *(val) &= (flags); }) +#define HA_ATOMIC_OR_FETCH(val, flags) ({ *(val) |= (flags); }) +#define HA_ATOMIC_ADD_FETCH(val, i) ({ *(val) += (i); }) +#define HA_ATOMIC_SUB_FETCH(val, i) ({ *(val) -= (i); }) + +#define HA_ATOMIC_FETCH_AND(val, i) \ + ({ \ + typeof((val)) __p_val = (val); \ + typeof(*(val)) __old_val = *__p_val; \ + *__p_val &= (i); \ + __old_val; \ + }) + +#define HA_ATOMIC_FETCH_OR(val, i) \ + ({ \ + typeof((val)) __p_val = (val); \ + typeof(*(val)) __old_val = *__p_val; \ + *__p_val |= (i); \ + __old_val; \ + }) + +#define HA_ATOMIC_FETCH_ADD(val, i) \ + ({ \ + typeof((val)) __p_val = (val); \ + typeof(*(val)) __old_val = *__p_val; \ + *__p_val += (i); \ + __old_val; \ + }) + +#define HA_ATOMIC_FETCH_SUB(val, i) \ + ({ \ + typeof((val)) __p_val = (val); \ + typeof(*(val)) __old_val = *__p_val; \ + *__p_val -= (i); \ + __old_val; \ + }) + +#define HA_ATOMIC_BTS(val, bit) \ + ({ \ + typeof((val)) __p_bts = (val); \ + typeof(*__p_bts) __b_bts = (1UL << (bit)); \ + typeof(*__p_bts) __t_bts = *__p_bts & __b_bts; \ + if (!__t_bts) \ + *__p_bts |= __b_bts; \ + __t_bts; \ + }) + +#define HA_ATOMIC_BTR(val, bit) \ + ({ \ + typeof((val)) __p_btr = (val); \ + typeof(*__p_btr) __b_btr = (1UL << (bit)); \ + typeof(*__p_btr) __t_btr = *__p_btr & __b_btr; \ + if (__t_btr) \ + *__p_btr &= ~__b_btr; \ + __t_btr; \ + }) + +#define HA_ATOMIC_CAS(val, old, new) \ + ({ \ + typeof(val) _v = (val); \ + typeof(old) _o = (old); \ + (*_v == *_o) ? ((*_v = (new)), 1) : ((*_o = *_v), 0); \ + }) + +/* warning, n is a pointer to the double value for dwcas */ +#define HA_ATOMIC_DWCAS(val, o, n) \ + ({ \ + long *_v = (long*)(val); \ + long *_o = (long*)(o); \ + long *_n = (long*)(n); \ + long _v0 = _v[0], _v1 = _v[1]; \ + (_v0 == _o[0] && _v1 == _o[1]) ? \ + (_v[0] = _n[0], _v[1] = _n[1], 1) : \ + (_o[0] = _v0, _o[1] = _v1, 0); \ + }) + +#define HA_ATOMIC_UPDATE_MAX(val, new) \ + ({ \ + typeof(val) __val = (val); \ + typeof(*(val)) __new_max = (new); \ + \ + if (*__val < __new_max) \ + *__val = __new_max; \ + *__val; \ + }) + +#define HA_ATOMIC_UPDATE_MIN(val, new) \ + ({ \ + typeof(val) __val = (val); \ + typeof(*(val)) __new_min = (new); \ + \ + if (*__val > __new_min) \ + *__val = __new_min; \ + *__val; \ + }) + +/* various barriers */ +#define __ha_barrier_atomic_load() do { } while (0) +#define __ha_barrier_atomic_store() do { } while (0) +#define __ha_barrier_atomic_full() do { } while (0) +#define __ha_barrier_load() do { } while (0) +#define __ha_barrier_store() do { } while (0) +#define __ha_barrier_full() do { } while (0) +#define __ha_compiler_barrier() do { } while (0) +#define __ha_cpu_relax() ({ 1; }) + +#else /* !USE_THREAD */ + +/* Threads are ENABLED, all atomic ops are made thread-safe. By extension they + * can also be used for inter-process synchronization but one must verify that + * the code still builds with threads disabled. 
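+ *
+ * Example (illustrative; do_once_init() is a made-up function):
+ * HA_ATOMIC_BTS() atomically sets a bit and returns its previous value,
+ * which gives a cheap "only one thread does this" construct:
+ *
+ *	static unsigned int init_flags;
+ *	if (!HA_ATOMIC_BTS(&init_flags, 0))
+ *		do_once_init();  // only the first thread to set bit 0 gets here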
+ */ + +#if defined(__GNUC__) && (__GNUC__ < 4 || __GNUC__ == 4 && __GNUC_MINOR__ < 7) && !defined(__clang__) +/* gcc < 4.7 */ + +#define HA_ATOMIC_LOAD(val) \ + ({ \ + typeof(*(val)) ret = \ + ({ __sync_synchronize(); *(volatile typeof(val))val; }); \ + __sync_synchronize(); \ + ret; \ + }) + +#define HA_ATOMIC_STORE(val, new) \ + ({ \ + typeof((val)) __val_store = (val); \ + typeof(*(val)) __old_store; \ + typeof((new)) __new_store = (new); \ + do { __old_store = *__val_store; \ + } while (!__sync_bool_compare_and_swap(__val_store, __old_store, __new_store) && __ha_cpu_relax()); \ + }) + +#define HA_ATOMIC_XCHG(val, new) \ + ({ \ + typeof((val)) __val_xchg = (val); \ + typeof(*(val)) __old_xchg; \ + typeof((new)) __new_xchg = (new); \ + do { __old_xchg = *__val_xchg; \ + } while (!__sync_bool_compare_and_swap(__val_xchg, __old_xchg, __new_xchg) && __ha_cpu_relax()); \ + __old_xchg; \ + }) + +#define HA_ATOMIC_AND(val, flags) do { __sync_and_and_fetch(val, flags); } while (0) +#define HA_ATOMIC_OR(val, flags) do { __sync_or_and_fetch(val, flags); } while (0) +#define HA_ATOMIC_ADD(val, i) do { __sync_add_and_fetch(val, i); } while (0) +#define HA_ATOMIC_SUB(val, i) do { __sync_sub_and_fetch(val, i); } while (0) +#define HA_ATOMIC_INC(val) do { __sync_add_and_fetch(val, 1); } while (0) +#define HA_ATOMIC_DEC(val) do { __sync_sub_and_fetch(val, 1); } while (0) + +#define HA_ATOMIC_AND_FETCH(val, flags) __sync_and_and_fetch(val, flags) +#define HA_ATOMIC_OR_FETCH(val, flags) __sync_or_and_fetch(val, flags) +#define HA_ATOMIC_ADD_FETCH(val, i) __sync_add_and_fetch(val, i) +#define HA_ATOMIC_SUB_FETCH(val, i) __sync_sub_and_fetch(val, i) + +#define HA_ATOMIC_FETCH_AND(val, flags) __sync_fetch_and_and(val, flags) +#define HA_ATOMIC_FETCH_OR(val, flags) __sync_fetch_and_or(val, flags) +#define HA_ATOMIC_FETCH_ADD(val, i) __sync_fetch_and_add(val, i) +#define HA_ATOMIC_FETCH_SUB(val, i) __sync_fetch_and_sub(val, i) + +#define HA_ATOMIC_BTS(val, bit) \ + ({ \ + typeof(*(val)) __b_bts = (1UL << (bit)); \ + __sync_fetch_and_or((val), __b_bts) & __b_bts; \ + }) + +#define HA_ATOMIC_BTR(val, bit) \ + ({ \ + typeof(*(val)) __b_btr = (1UL << (bit)); \ + __sync_fetch_and_and((val), ~__b_btr) & __b_btr; \ + }) + +/* the CAS is a bit complicated. The older API doesn't support returning the + * value and the swap's result at the same time. So here we take what looks + * like the safest route, consisting in using the boolean version guaranteeing + * that the operation was performed or not, and we snoop a previous value. If + * the compare succeeds, we return. If it fails, we return the previous value, + * but only if it differs from the expected one. If it's the same it's a race + * thus we try again to avoid confusing a possibly sensitive caller. 
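+ *
+ * Typical usage (an illustrative sketch; <val> is any shared integer) is a
+ * retry loop which reuses the old value refreshed by a failed CAS:
+ *
+ *	unsigned int old = HA_ATOMIC_LOAD(&val), new;
+ *	do {
+ *		new = old + 1;
+ *	} while (!HA_ATOMIC_CAS(&val, &old, new) && __ha_cpu_relax());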
+ */ +#define HA_ATOMIC_CAS(val, old, new) \ + ({ \ + typeof((val)) __val_cas = (val); \ + typeof((old)) __oldp_cas = (old); \ + typeof(*(old)) __oldv_cas; \ + typeof((new)) __new_cas = (new); \ + int __ret_cas; \ + do { \ + __oldv_cas = *__val_cas; \ + __ret_cas = __sync_bool_compare_and_swap(__val_cas, *__oldp_cas, __new_cas); \ + } while (!__ret_cas && *__oldp_cas == __oldv_cas && __ha_cpu_relax()); \ + if (!__ret_cas) \ + *__oldp_cas = __oldv_cas; \ + __ret_cas; \ + }) + +/* warning, n is a pointer to the double value for dwcas */ +#define HA_ATOMIC_DWCAS(val, o, n) __ha_cas_dw(val, o, n) + +#define HA_ATOMIC_UPDATE_MAX(val, new) \ + ({ \ + typeof(val) __val = (val); \ + typeof(*(val)) __old_max = *__val; \ + typeof(*(val)) __new_max = (new); \ + \ + while (__old_max < __new_max && \ + !HA_ATOMIC_CAS(__val, &__old_max, __new_max) && __ha_cpu_relax()); \ + *__val; \ + }) + +#define HA_ATOMIC_UPDATE_MIN(val, new) \ + ({ \ + typeof(val) __val = (val); \ + typeof(*(val)) __old_min = *__val; \ + typeof(*(val)) __new_min = (new); \ + \ + while (__old_min > __new_min && \ + !HA_ATOMIC_CAS(__val, &__old_min, __new_min) && __ha_cpu_relax()); \ + *__val; \ + }) + +#else /* gcc */ + +/* gcc >= 4.7 or clang */ + +#define HA_ATOMIC_STORE(val, new) __atomic_store_n(val, new, __ATOMIC_RELEASE) +#define HA_ATOMIC_LOAD(val) __atomic_load_n(val, __ATOMIC_ACQUIRE) +#define HA_ATOMIC_XCHG(val, new) __atomic_exchange_n(val, new, __ATOMIC_ACQ_REL) + +#define HA_ATOMIC_AND(val, flags) do { __atomic_and_fetch(val, flags, __ATOMIC_SEQ_CST); } while (0) +#define HA_ATOMIC_OR(val, flags) do { __atomic_or_fetch(val, flags, __ATOMIC_SEQ_CST); } while (0) +#define HA_ATOMIC_ADD(val, i) do { __atomic_add_fetch(val, i, __ATOMIC_SEQ_CST); } while (0) +#define HA_ATOMIC_SUB(val, i) do { __atomic_sub_fetch(val, i, __ATOMIC_SEQ_CST); } while (0) +#define HA_ATOMIC_INC(val) do { __atomic_add_fetch(val, 1, __ATOMIC_SEQ_CST); } while (0) +#define HA_ATOMIC_DEC(val) do { __atomic_sub_fetch(val, 1, __ATOMIC_SEQ_CST); } while (0) + +#define HA_ATOMIC_AND_FETCH(val, flags) __atomic_and_fetch(val, flags, __ATOMIC_SEQ_CST) +#define HA_ATOMIC_OR_FETCH(val, flags) __atomic_or_fetch(val, flags, __ATOMIC_SEQ_CST) +#define HA_ATOMIC_ADD_FETCH(val, i) __atomic_add_fetch(val, i, __ATOMIC_SEQ_CST) +#define HA_ATOMIC_SUB_FETCH(val, i) __atomic_sub_fetch(val, i, __ATOMIC_SEQ_CST) + +#define HA_ATOMIC_FETCH_AND(val, flags) __atomic_fetch_and(val, flags, __ATOMIC_SEQ_CST) +#define HA_ATOMIC_FETCH_OR(val, flags) __atomic_fetch_or(val, flags, __ATOMIC_SEQ_CST) +#define HA_ATOMIC_FETCH_ADD(val, i) __atomic_fetch_add(val, i, __ATOMIC_SEQ_CST) +#define HA_ATOMIC_FETCH_SUB(val, i) __atomic_fetch_sub(val, i, __ATOMIC_SEQ_CST) + +#if defined(__GCC_ASM_FLAG_OUTPUTS__) && (defined(__i386__) || defined (__x86_64__)) +#define HA_ATOMIC_BTS(val, bit) \ + ({ \ + unsigned char __ret; \ + if (sizeof(long) == 8 && sizeof(*(val)) == 8) { \ + asm volatile("lock btsq %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned long)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 4) { \ + asm volatile("lock btsl %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned int)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 2) { \ + asm volatile("lock btsw %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned short)(bit)) \ + : "cc"); \ + } else { \ + typeof(*(val)) __b_bts = (1UL << (bit)); \ + __ret = !!(__atomic_fetch_or((val), __b_bts, __ATOMIC_SEQ_CST) & __b_bts); \ + } \ + __ret; \ + }) + +#define HA_ATOMIC_BTR(val, 
bit) \ + ({ \ + unsigned char __ret; \ + if (sizeof(long) == 8 && sizeof(*(val)) == 8) { \ + asm volatile("lock btrq %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned long)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 4) { \ + asm volatile("lock btrl %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned int)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 2) { \ + asm volatile("lock btrw %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned short)(bit)) \ + : "cc"); \ + } else { \ + typeof(*(val)) __b_bts = (1UL << (bit)); \ + __ret = !!(__atomic_fetch_and((val), ~__b_bts, __ATOMIC_SEQ_CST) & __b_bts); \ + } \ + __ret; \ + }) + +#else // not x86 or !__GCC_ASM_FLAG_OUTPUTS__ + +#define HA_ATOMIC_BTS(val, bit) \ + ({ \ + typeof(*(val)) __b_bts = (1UL << (bit)); \ + __atomic_fetch_or((val), __b_bts, __ATOMIC_SEQ_CST) & __b_bts; \ + }) + +#define HA_ATOMIC_BTR(val, bit) \ + ({ \ + typeof(*(val)) __b_btr = (1UL << (bit)); \ + __atomic_fetch_and((val), ~__b_btr, __ATOMIC_SEQ_CST) & __b_btr; \ + }) + +#endif // x86 || __GCC_ASM_FLAG_OUTPUTS__ + +#define HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST) + +/* warning, n is a pointer to the double value for dwcas */ +#define HA_ATOMIC_DWCAS(val, o, n) __ha_cas_dw(val, o, n) + +#define HA_ATOMIC_UPDATE_MAX(val, new) \ + ({ \ + typeof(val) __val = (val); \ + typeof(*(val)) __old_max = *__val; \ + typeof(*(val)) __new_max = (new); \ + \ + while (__old_max < __new_max && \ + !HA_ATOMIC_CAS(__val, &__old_max, __new_max) && __ha_cpu_relax()); \ + *__val; \ + }) + +#define HA_ATOMIC_UPDATE_MIN(val, new) \ + ({ \ + typeof(val) __val = (val); \ + typeof(*(val)) __old_min = *__val; \ + typeof(*(val)) __new_min = (new); \ + \ + while (__old_min > __new_min && \ + !HA_ATOMIC_CAS(__val, &__old_min, __new_min) && __ha_cpu_relax()); \ + *__val; \ + }) + +/* Modern compilers provide variants that don't generate any memory barrier. + * If you're unsure how to deal with barriers, just use the HA_ATOMIC_* version, + * that will always generate correct code. + * Usually it's fine to use those when updating data that have no dependency, + * ie updating a counter. Otherwise a barrier is required. 
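+ *
+ * Example (illustrative; the variable names are made up): a standalone
+ * activity counter may use the relaxed form, while a flag that another
+ * thread will test right afterwards must use the ordered one:
+ *
+ *	_HA_ATOMIC_INC(&sess_count);      // plain counter, no ordering needed
+ *	HA_ATOMIC_OR(&state, DONE_BIT);   // ordered: safely publishes completion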
+ */ + +#define _HA_ATOMIC_LOAD(val) __atomic_load_n(val, __ATOMIC_RELAXED) +#define _HA_ATOMIC_STORE(val, new) __atomic_store_n(val, new, __ATOMIC_RELAXED) +#define _HA_ATOMIC_XCHG(val, new) __atomic_exchange_n(val, new, __ATOMIC_RELAXED) + +#define _HA_ATOMIC_AND(val, flags) do { __atomic_and_fetch(val, flags, __ATOMIC_RELAXED); } while (0) +#define _HA_ATOMIC_OR(val, flags) do { __atomic_or_fetch(val, flags, __ATOMIC_RELAXED); } while (0) +#define _HA_ATOMIC_ADD(val, i) do { __atomic_add_fetch(val, i, __ATOMIC_RELAXED); } while (0) +#define _HA_ATOMIC_SUB(val, i) do { __atomic_sub_fetch(val, i, __ATOMIC_RELAXED); } while (0) +#define _HA_ATOMIC_INC(val) do { __atomic_add_fetch(val, 1, __ATOMIC_RELAXED); } while (0) +#define _HA_ATOMIC_DEC(val) do { __atomic_sub_fetch(val, 1, __ATOMIC_RELAXED); } while (0) + +#define _HA_ATOMIC_AND_FETCH(val, flags) __atomic_and_fetch(val, flags, __ATOMIC_RELAXED) +#define _HA_ATOMIC_OR_FETCH(val, flags) __atomic_or_fetch(val, flags, __ATOMIC_RELAXED) +#define _HA_ATOMIC_ADD_FETCH(val, i) __atomic_add_fetch(val, i, __ATOMIC_RELAXED) +#define _HA_ATOMIC_SUB_FETCH(val, i) __atomic_sub_fetch(val, i, __ATOMIC_RELAXED) + +#define _HA_ATOMIC_FETCH_AND(val, flags) __atomic_fetch_and(val, flags, __ATOMIC_RELAXED) +#define _HA_ATOMIC_FETCH_OR(val, flags) __atomic_fetch_or(val, flags, __ATOMIC_RELAXED) +#define _HA_ATOMIC_FETCH_ADD(val, i) __atomic_fetch_add(val, i, __ATOMIC_RELAXED) +#define _HA_ATOMIC_FETCH_SUB(val, i) __atomic_fetch_sub(val, i, __ATOMIC_RELAXED) + +#if defined(__GCC_ASM_FLAG_OUTPUTS__) && (defined(__i386__) || defined (__x86_64__)) +#define _HA_ATOMIC_BTS(val, bit) \ + ({ \ + unsigned char __ret; \ + if (sizeof(long) == 8 && sizeof(*(val)) == 8) { \ + asm volatile("lock btsq %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned long)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 4) { \ + asm volatile("lock btsl %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned int)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 2) { \ + asm volatile("lock btsw %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned short)(bit)) \ + : "cc"); \ + } else { \ + typeof(*(val)) __b_bts = (1UL << (bit)); \ + __ret = !!(__atomic_fetch_or((val), __b_bts, __ATOMIC_RELAXED) & __b_bts); \ + } \ + __ret; \ + }) + +#define _HA_ATOMIC_BTR(val, bit) \ + ({ \ + unsigned char __ret; \ + if (sizeof(long) == 8 && sizeof(*(val)) == 8) { \ + asm volatile("lock btrq %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned long)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 4) { \ + asm volatile("lock btrl %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned int)(bit)) \ + : "cc"); \ + } else if (sizeof(*(val)) == 2) { \ + asm volatile("lock btrw %2, %0\n" \ + : "+m" (*(val)), "=@ccc"(__ret) \ + : "Ir" ((unsigned short)(bit)) \ + : "cc"); \ + } else { \ + typeof(*(val)) __b_bts = (1UL << (bit)); \ + __ret = !!(__atomic_fetch_and((val), ~__b_bts, __ATOMIC_RELAXED) & __b_bts); \ + } \ + __ret; \ + }) + +#else // not x86 or !__GCC_ASM_FLAG_OUTPUTS__ + +#define _HA_ATOMIC_BTS(val, bit) \ + ({ \ + typeof(*(val)) __b_bts = (1UL << (bit)); \ + __atomic_fetch_or((val), __b_bts, __ATOMIC_RELAXED) & __b_bts; \ + }) + +#define _HA_ATOMIC_BTR(val, bit) \ + ({ \ + typeof(*(val)) __b_btr = (1UL << (bit)); \ + __atomic_fetch_and((val), ~__b_btr, __ATOMIC_RELAXED) & __b_btr; \ + }) +#endif + +#define _HA_ATOMIC_CAS(val, old, new) __atomic_compare_exchange_n(val, old, new, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED) +/* 
warning, n is a pointer to the double value for dwcas */
+#define _HA_ATOMIC_DWCAS(val, o, n) __ha_cas_dw(val, o, n)
+
+#endif /* gcc >= 4.7 */
+
+/* Here come a few architecture-specific double-word CAS and barrier
+ * implementations.
+ */
+
+#ifdef __x86_64__
+
+static __inline void
+__ha_barrier_load(void)
+{
+	__asm __volatile("" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_store(void)
+{
+	__asm __volatile("" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_full(void)
+{
+	__asm __volatile("mfence" ::: "memory");
+}
+
+/* Use __ha_barrier_atomic* when you're trying to protect data that are
+ * modified using _HA_ATOMIC*
+ */
+static __inline void
+__ha_barrier_atomic_load(void)
+{
+	__asm __volatile("" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_atomic_store(void)
+{
+	__asm __volatile("" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_atomic_full(void)
+{
+	__asm __volatile("" ::: "memory");
+}
+
+static __inline int
+__ha_cas_dw(void *target, void *compare, const void *set)
+{
+	char ret;
+
+	__asm __volatile("lock cmpxchg16b %0; setz %3"
+			 : "+m" (*(void **)target),
+			   "=a" (((void **)compare)[0]),
+			   "=d" (((void **)compare)[1]),
+			   "=q" (ret)
+			 : "a" (((void **)compare)[0]),
+			   "d" (((void **)compare)[1]),
+			   "b" (((const void **)set)[0]),
+			   "c" (((const void **)set)[1])
+			 : "memory", "cc");
+	return (ret);
+}
+
+/* short-lived CPU relaxation */
+#define __ha_cpu_relax() ({ asm volatile("rep;nop\n"); 1; })
+
+#elif defined(__arm__) && (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__))
+
+static __inline void
+__ha_barrier_load(void)
+{
+	__asm __volatile("dmb" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_store(void)
+{
+	__asm __volatile("dsb" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_full(void)
+{
+	__asm __volatile("dmb" ::: "memory");
+}
+
+/* Use __ha_barrier_atomic* when you're trying to protect data that are
+ * modified using _HA_ATOMIC*
+ */
+static __inline void
+__ha_barrier_atomic_load(void)
+{
+	__asm __volatile("dmb" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_atomic_store(void)
+{
+	__asm __volatile("dsb" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_atomic_full(void)
+{
+	__asm __volatile("dmb" ::: "memory");
+}
+
+static __inline int __ha_cas_dw(void *target, void *compare, const void *set)
+{
+	uint64_t previous;
+	int tmp;
+
+	__asm __volatile("1:"
+	                 "ldrexd %0, [%4];"
+	                 "cmp %Q0, %Q2;"
+	                 "ittt eq;"
+	                 "cmpeq %R0, %R2;"
+	                 "strexdeq %1, %3, [%4];"
+	                 "cmpeq %1, #1;"
+	                 "beq 1b;"
+	                 : "=&r" (previous), "=&r" (tmp)
+	                 : "r" (*(uint64_t *)compare), "r" (*(uint64_t *)set), "r" (target)
+	                 : "memory", "cc");
+	tmp = (previous == *(uint64_t *)compare);
+	*(uint64_t *)compare = previous;
+	return (tmp);
+}
+
+/* short-lived CPU relaxation */
+#define __ha_cpu_relax() ({ asm volatile(""); 1; })
+
+#elif defined (__aarch64__)
+
+static __inline void
+__ha_barrier_load(void)
+{
+	__asm __volatile("dmb ishld" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_store(void)
+{
+	__asm __volatile("dmb ishst" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_full(void)
+{
+	__asm __volatile("dmb ish" ::: "memory");
+}
+
+/* Use __ha_barrier_atomic* when you're trying to protect data that are
+ * modified using _HA_ATOMIC*
+ */
+static __inline void
+__ha_barrier_atomic_load(void)
+{
+	__asm __volatile("dmb ishld" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_atomic_store(void)
+{
+	__asm __volatile("dmb ishst" ::: "memory");
+}
+
+static __inline void
+__ha_barrier_atomic_full(void)
+{
+	__asm __volatile("dmb ish" ::: "memory");
+}
+
+/* short-lived CPU relaxation; this was shown to improve fairness on
+ * modern ARMv8 cores such as Neoverse N1.
+ */
+#define __ha_cpu_relax() ({ asm volatile("isb" ::: "memory"); 1; })
+
+#if defined(__ARM_FEATURE_ATOMICS) && !defined(__clang__) // ARMv8.1-A atomics
+
+/* returns 0 on failure, non-zero on success */
+static forceinline int __ha_cas_dw(void *target, void *compare, const void *set)
+{
+	/* There's no status set by the CASP instruction so we need to keep a
+	 * copy of the original registers and compare them afterwards to detect
+	 * if we could apply the change. In order to pass a pair, we simply map
+	 * a register pair on a struct so that the compiler can emit register
+	 * pairs that we can use thanks to the undocumented "%H" modifier
+	 * mentioned on the link below:
+	 * https://patchwork.ozlabs.org/project/gcc/patch/59368A74.2060908@foss.arm.com/
+	 */
+	struct pair { uint64_t r[2]; };
+	register struct pair bck = *(struct pair *)compare;
+	register struct pair cmp asm("x0") = bck;
+	register struct pair new asm("x2") = *(const struct pair*)set;
+	int ret;
+
+	__asm__ __volatile__("casp %0, %H0, %2, %H2, [%1]\n"
+			     : "+r" (cmp)          // %0
+			     : "r" (target),       // %1
+			       "r" (new)           // %2
+			     : "memory");
+
+	/* if the old value is still unchanged, we won; otherwise we store the
+	 * refreshed old value.
+	 */
+	ret = cmp.r[0] == bck.r[0] && cmp.r[1] == bck.r[1];
+	if (unlikely(!ret)) {
+		/* update the old value on failure. Note that in this case the
+		 * caller will likely relax and jump backwards so we don't care
+		 * about this cost provided that it doesn't enlarge the fast
+		 * code path.
+		 */
+		*(struct pair *)compare = cmp;
+	}
+	return ret;
+}
+
+#elif defined(__SIZEOF_INT128__) && defined(__ARM_FEATURE_ATOMICS) // 128-bit and ARMv8.1-A will work
+
+/* According to https://gcc.gnu.org/onlinedocs/gcc/_005f_005fatomic-Builtins.html
+ * we can use atomics on __int128. The availability of CAS is defined there:
+ * https://gcc.gnu.org/onlinedocs/cpp/Common-Predefined-Macros.html
+ * However these usually involve a function call which can be expensive for some
+ * cases, but gcc 10.2 and above can reroute the function call to either LL/SC for
+ * v8.0 or LSE for v8.1+, which allows using a more scalable version on v8.1+ at
+ * the extra cost of a function call.
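+ *
+ * Usage sketch (illustrative; <shared>, <cur>, <next> and the payload are
+ * made up, and <shared> must be suitably aligned): every __ha_cas_dw()
+ * variant compares and swaps two machine words at once and refreshes
+ * <compare> on failure:
+ *
+ *	struct { void *ptr; uint64_t gen; } shared, cur, next;
+ *	do {
+ *		next.ptr = new_ptr;          // made-up payload
+ *		next.gen = cur.gen + 1;
+ *	} while (!__ha_cas_dw(&shared, &cur, &next) && __ha_cpu_relax());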
+ */
+
+/* returns 0 on failure, non-zero on success */
+static __inline int __ha_cas_dw(void *target, void *compare, const void *set)
+{
+	return __atomic_compare_exchange_n((__int128*)target, (__int128*)compare, *(const __int128*)set,
+	                                   0, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
+}
+
+#else // neither ARMv8.1-A atomics nor 128-bit atomics
+
+/* returns 0 on failure, non-zero on success */
+static __inline int __ha_cas_dw(void *target, void *compare, void *set)
+{
+	void *value[2];
+	uint64_t tmp1, tmp2;
+
+	__asm__ __volatile__("1:"
+                             "ldxp %0, %1, [%4]\n"
+                             "mov %2, %0\n"
+                             "mov %3, %1\n"
+                             "eor %0, %0, %5\n"
+                             "eor %1, %1, %6\n"
+                             "orr %1, %0, %1\n"
+                             "mov %w0, #0\n"
+                             "cbnz %1, 2f\n"
+                             "stxp %w0, %7, %8, [%4]\n"
+                             "cbnz %w0, 1b\n"
+                             "mov %w0, #1\n"
+                             "2:"
+                             : "=&r" (tmp1), "=&r" (tmp2), "=&r" (value[0]), "=&r" (value[1])
+                             : "r" (target), "r" (((void **)(compare))[0]), "r" (((void **)(compare))[1]), "r" (((void **)(set))[0]), "r" (((void **)(set))[1])
+                             : "cc", "memory");
+
+	((void **)(compare))[0] = value[0];
+	((void **)(compare))[1] = value[1];
+	return (tmp1);
+}
+#endif // ARMv8.1-A atomics
+
+#else /* unknown / unhandled architecture, fall back to generic barriers */
+
+#define __ha_barrier_atomic_load __sync_synchronize
+#define __ha_barrier_atomic_store __sync_synchronize
+#define __ha_barrier_atomic_full __sync_synchronize
+#define __ha_barrier_load __sync_synchronize
+#define __ha_barrier_store __sync_synchronize
+#define __ha_barrier_full __sync_synchronize
+/* Note: there is no generic DWCAS */
+
+/* short-lived CPU relaxation */
+#define __ha_cpu_relax() ({ asm volatile(""); 1; })
+
+#endif /* end of arch-specific barrier/dwcas */
+
+static inline void __ha_compiler_barrier(void)
+{
+	__asm __volatile("" ::: "memory");
+}
+
+#endif /* USE_THREAD */
+
+
+/* fallbacks to remap all undefined _HA_ATOMIC_* onto their safe equivalent */
+#ifndef _HA_ATOMIC_BTR
+#define _HA_ATOMIC_BTR HA_ATOMIC_BTR
+#endif /* !_HA_ATOMIC_BTR */
+
+#ifndef _HA_ATOMIC_BTS
+#define _HA_ATOMIC_BTS HA_ATOMIC_BTS
+#endif /* !_HA_ATOMIC_BTS */
+
+#ifndef _HA_ATOMIC_CAS
+#define _HA_ATOMIC_CAS HA_ATOMIC_CAS
+#endif /* !_HA_ATOMIC_CAS */
+
+#ifndef _HA_ATOMIC_DWCAS
+#define _HA_ATOMIC_DWCAS HA_ATOMIC_DWCAS
+#endif /* !_HA_ATOMIC_DWCAS */
+
+#ifndef _HA_ATOMIC_ADD
+#define _HA_ATOMIC_ADD HA_ATOMIC_ADD
+#endif /* !_HA_ATOMIC_ADD */
+
+#ifndef _HA_ATOMIC_ADD_FETCH
+#define _HA_ATOMIC_ADD_FETCH HA_ATOMIC_ADD_FETCH
+#endif /* !_HA_ATOMIC_ADD_FETCH */
+
+#ifndef _HA_ATOMIC_FETCH_ADD
+#define _HA_ATOMIC_FETCH_ADD HA_ATOMIC_FETCH_ADD
+#endif /* !_HA_ATOMIC_FETCH_ADD */
+
+#ifndef _HA_ATOMIC_SUB
+#define _HA_ATOMIC_SUB HA_ATOMIC_SUB
+#endif /* !_HA_ATOMIC_SUB */
+
+#ifndef _HA_ATOMIC_SUB_FETCH
+#define _HA_ATOMIC_SUB_FETCH HA_ATOMIC_SUB_FETCH
+#endif /* !_HA_ATOMIC_SUB_FETCH */
+
+#ifndef _HA_ATOMIC_FETCH_SUB
+#define _HA_ATOMIC_FETCH_SUB HA_ATOMIC_FETCH_SUB
+#endif /* !_HA_ATOMIC_FETCH_SUB */
+
+#ifndef _HA_ATOMIC_INC
+#define _HA_ATOMIC_INC HA_ATOMIC_INC
+#endif /* !_HA_ATOMIC_INC */
+
+#ifndef _HA_ATOMIC_DEC
+#define _HA_ATOMIC_DEC HA_ATOMIC_DEC
+#endif /* !_HA_ATOMIC_DEC */
+
+#ifndef _HA_ATOMIC_AND
+#define _HA_ATOMIC_AND HA_ATOMIC_AND
+#endif /* !_HA_ATOMIC_AND */
+
+#ifndef _HA_ATOMIC_AND_FETCH
+#define _HA_ATOMIC_AND_FETCH HA_ATOMIC_AND_FETCH
+#endif /* !_HA_ATOMIC_AND_FETCH */
+
+#ifndef _HA_ATOMIC_FETCH_AND
+#define _HA_ATOMIC_FETCH_AND HA_ATOMIC_FETCH_AND
+#endif /* !_HA_ATOMIC_FETCH_AND */
+
+#ifndef _HA_ATOMIC_OR
+#define _HA_ATOMIC_OR HA_ATOMIC_OR
+#endif /* !_HA_ATOMIC_OR */
+
+#ifndef 
_HA_ATOMIC_OR_FETCH +#define _HA_ATOMIC_OR_FETCH HA_ATOMIC_OR_FETCH +#endif /* !_HA_ATOMIC_OR_FETCH */ + +#ifndef _HA_ATOMIC_FETCH_OR +#define _HA_ATOMIC_FETCH_OR HA_ATOMIC_FETCH_OR +#endif /* !_HA_ATOMIC_FETCH_OR */ + +#ifndef _HA_ATOMIC_XCHG +#define _HA_ATOMIC_XCHG HA_ATOMIC_XCHG +#endif /* !_HA_ATOMIC_XCHG */ + +#ifndef _HA_ATOMIC_STORE +#define _HA_ATOMIC_STORE HA_ATOMIC_STORE +#endif /* !_HA_ATOMIC_STORE */ + +#ifndef _HA_ATOMIC_LOAD +#define _HA_ATOMIC_LOAD HA_ATOMIC_LOAD +#endif /* !_HA_ATOMIC_LOAD */ + +#endif /* _HAPROXY_ATOMIC_H */ diff --git a/include/haproxy/auth-t.h b/include/haproxy/auth-t.h new file mode 100644 index 0000000..35a1ff6 --- /dev/null +++ b/include/haproxy/auth-t.h @@ -0,0 +1,57 @@ +/* + * include/haproxy/auth-t.h + * Types definitions for user authentication & authorization. + * + * Copyright 2010 Krzysztof Piotr Oledzki <ole@ans.pl> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#ifndef _HAPROXY_AUTH_T_H +#define _HAPROXY_AUTH_T_H + +#include <haproxy/api-t.h> + +#define AU_O_INSECURE 0x00000001 /* insecure, unencrypted password */ + +struct auth_groups { + struct auth_groups *next; + char *name; + char *groupusers; /* Just used during the configuration parsing. */ +}; + +struct auth_groups_list { + struct auth_groups_list *next; + struct auth_groups *group; +}; + +struct auth_users { + struct auth_users *next; + unsigned int flags; + char *user, *pass; + union { + char *groups_names; /* Just used during the configuration parsing. */ + struct auth_groups_list *groups; + } u; +}; + +struct userlist { + struct userlist *next; + char *name; + struct auth_users *users; + struct auth_groups *groups; +}; + +#endif /* _HAPROXY_AUTH_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ + diff --git a/include/haproxy/auth.h b/include/haproxy/auth.h new file mode 100644 index 0000000..2fe2b35 --- /dev/null +++ b/include/haproxy/auth.h @@ -0,0 +1,40 @@ +/* + * include/haproxy/auth.h + * Functions for user authentication & authorization. + * + * Copyright 2010 Krzysztof Piotr Oledzki <ole@ans.pl> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#ifndef _HAPROXY_AUTH_H +#define _HAPROXY_AUTH_H + +#include <haproxy/api.h> +#include <haproxy/auth-t.h> +#include <haproxy/pattern-t.h> +#include <haproxy/sample-t.h> + +extern struct userlist *userlist; + +struct userlist *auth_find_userlist(char *name); +unsigned int auth_resolve_groups(struct userlist *l, char *groups); +int userlist_postinit(); +void userlist_free(struct userlist *ul); +struct pattern *pat_match_auth(struct sample *smp, struct pattern_expr *expr, int fill); +int check_user(struct userlist *ul, const char *user, const char *pass); +int check_group(struct userlist *ul, char *name); + +#endif /* _HAPROXY_AUTH_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ + diff --git a/include/haproxy/backend-t.h b/include/haproxy/backend-t.h new file mode 100644 index 0000000..02a2cc5 --- /dev/null +++ b/include/haproxy/backend-t.h @@ -0,0 +1,191 @@ +/* + * include/haproxy/backend-t.h + * This file assembles definitions for backends + * + * Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_BACKEND_T_H +#define _HAPROXY_BACKEND_T_H + +#include <haproxy/api-t.h> +#include <haproxy/lb_chash-t.h> +#include <haproxy/lb_fas-t.h> +#include <haproxy/lb_fwlc-t.h> +#include <haproxy/lb_fwrr-t.h> +#include <haproxy/lb_map-t.h> +#include <haproxy/server-t.h> +#include <haproxy/thread-t.h> + +/* Parameters for lbprm.algo */ + +/* Lower bits define the kind of load balancing method, which means the type of + * algorithm, and which criterion it is based on. For this reason, those bits + * also include information about dependencies, so that the config parser can + * detect incompatibilities. + */ + +/* LB parameters are on the lower 8 bits. Depends on the LB kind. 
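+ *
+ * Example (illustrative): "balance uri" yields algo == BE_LB_ALGO_UH, which
+ * the masks below split back into its components:
+ *
+ *	(algo & BE_LB_KIND) == BE_LB_KIND_HI    // hash of input
+ *	(algo & BE_LB_NEED) == BE_LB_NEED_HTTP  // requires an HTTP request
+ *	(algo & BE_LB_PARM) == BE_LB_HASH_URI   // the hashed input is the URI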
*/ + +/* BE_LB_HASH_* is used with BE_LB_KIND_HI */ +#define BE_LB_HASH_SRC 0x00000000 /* hash source IP */ +#define BE_LB_HASH_URI 0x00000001 /* hash HTTP URI */ +#define BE_LB_HASH_PRM 0x00000002 /* hash HTTP URL parameter */ +#define BE_LB_HASH_HDR 0x00000003 /* hash HTTP header value */ +#define BE_LB_HASH_RDP 0x00000004 /* hash RDP cookie value */ +#define BE_LB_HASH_SMP 0x00000005 /* hash a sample expression */ + +/* BE_LB_RR_* is used with BE_LB_KIND_RR */ +#define BE_LB_RR_DYN 0x00000000 /* dynamic round robin (default) */ +#define BE_LB_RR_STATIC 0x00000001 /* static round robin */ +#define BE_LB_RR_RANDOM 0x00000002 /* random round robin */ + +/* BE_LB_CB_* is used with BE_LB_KIND_CB */ +#define BE_LB_CB_LC 0x00000000 /* least-connections */ +#define BE_LB_CB_FAS 0x00000001 /* first available server (opposite of leastconn) */ + +#define BE_LB_PARM 0x000000FF /* mask to get/clear the LB param */ + +/* Required input(s) */ +#define BE_LB_NEED_NONE 0x00000000 /* no input needed */ +#define BE_LB_NEED_ADDR 0x00000100 /* only source address needed */ +#define BE_LB_NEED_DATA 0x00000200 /* some payload is needed */ +#define BE_LB_NEED_HTTP 0x00000400 /* an HTTP request is needed */ +#define BE_LB_NEED_LOG 0x00000800 /* LOG backend required */ +#define BE_LB_NEED 0x0000FF00 /* mask to get/clear dependencies */ + +/* Algorithm */ +#define BE_LB_KIND_NONE 0x00000000 /* algorithm not set */ +#define BE_LB_KIND_RR 0x00010000 /* round-robin */ +#define BE_LB_KIND_CB 0x00020000 /* connection-based */ +#define BE_LB_KIND_HI 0x00030000 /* hash of input (see hash inputs above) */ +#define BE_LB_KIND 0x00070000 /* mask to get/clear LB algorithm */ + +/* All known variants of load balancing algorithms. These can be cleared using + * the BE_LB_ALGO mask. For a check, using BE_LB_KIND is preferred. + */ +#define BE_LB_ALGO_NONE (BE_LB_KIND_NONE | BE_LB_NEED_NONE) /* not defined */ +#define BE_LB_ALGO_RR (BE_LB_KIND_RR | BE_LB_NEED_NONE) /* round robin */ +#define BE_LB_ALGO_RND (BE_LB_KIND_RR | BE_LB_NEED_NONE | BE_LB_RR_RANDOM) /* random value */ +#define BE_LB_ALGO_LC (BE_LB_KIND_CB | BE_LB_NEED_NONE | BE_LB_CB_LC) /* least connections */ +#define BE_LB_ALGO_FAS (BE_LB_KIND_CB | BE_LB_NEED_NONE | BE_LB_CB_FAS) /* first available server */ +#define BE_LB_ALGO_SRR (BE_LB_KIND_RR | BE_LB_NEED_NONE | BE_LB_RR_STATIC) /* static round robin */ +#define BE_LB_ALGO_SH (BE_LB_KIND_HI | BE_LB_NEED_ADDR | BE_LB_HASH_SRC) /* hash: source IP */ +#define BE_LB_ALGO_UH (BE_LB_KIND_HI | BE_LB_NEED_HTTP | BE_LB_HASH_URI) /* hash: HTTP URI */ +#define BE_LB_ALGO_PH (BE_LB_KIND_HI | BE_LB_NEED_HTTP | BE_LB_HASH_PRM) /* hash: HTTP URL parameter */ +#define BE_LB_ALGO_HH (BE_LB_KIND_HI | BE_LB_NEED_HTTP | BE_LB_HASH_HDR) /* hash: HTTP header value */ +#define BE_LB_ALGO_RCH (BE_LB_KIND_HI | BE_LB_NEED_DATA | BE_LB_HASH_RDP) /* hash: RDP cookie value */ +#define BE_LB_ALGO_SMP (BE_LB_KIND_HI | BE_LB_NEED_DATA | BE_LB_HASH_SMP) /* hash: sample expression */ +#define BE_LB_ALGO_LH (BE_LB_KIND_HI | BE_LB_NEED_LOG | BE_LB_HASH_SMP) /* log hash: sample expression */ +#define BE_LB_ALGO_LS (BE_LB_KIND_CB | BE_LB_NEED_LOG | BE_LB_CB_FAS) /* log sticky */ +#define BE_LB_ALGO (BE_LB_KIND | BE_LB_NEED | BE_LB_PARM ) /* mask to clear algo */ + +/* Higher bits define how a given criterion is mapped to a server. In fact it + * designates the LB function by itself. The dynamic algorithms will also have + * the DYN bit set. These flags are automatically set at the end of the parsing. 
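+ *
+ * For example (illustrative, reflecting the parser's usual choices): plain
+ * "balance roundrobin" ends up with BE_LB_LKUP_RRTREE plus BE_LB_PROP_DYN,
+ * while a hash-based algorithm maps to BE_LB_LKUP_MAP or BE_LB_LKUP_CHTREE
+ * depending on the "hash-type" setting.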
+ */
+#define BE_LB_LKUP_NONE   0x00000000 /* not defined */
+#define BE_LB_LKUP_MAP    0x00100000 /* static map based lookup */
+#define BE_LB_LKUP_RRTREE 0x00200000 /* FWRR tree lookup */
+#define BE_LB_LKUP_LCTREE 0x00300000 /* FWLC tree lookup */
+#define BE_LB_LKUP_CHTREE 0x00400000 /* consistent hash */
+#define BE_LB_LKUP_FSTREE 0x00500000 /* FAS tree lookup */
+#define BE_LB_LKUP        0x00700000 /* mask to get just the LKUP value */
+
+/* additional properties */
+#define BE_LB_PROP_DYN    0x00800000 /* bit to indicate a dynamic algorithm */
+
+/* hash types */
+#define BE_LB_HASH_MAP    0x00000000 /* map-based hash (default) */
+#define BE_LB_HASH_CONS   0x01000000 /* consistent hash */
+#define BE_LB_HASH_TYPE   0x01000000 /* get/clear hash types */
+
+/* additional modifier on top of the hash function (only avalanche right now) */
+#define BE_LB_HMOD_AVAL   0x02000000 /* avalanche modifier */
+#define BE_LB_HASH_MOD    0x02000000 /* get/clear hash modifier */
+
+/* BE_LB_HFCN_* is the hash function, to be used with BE_LB_HASH_FUNC */
+#define BE_LB_HFCN_SDBM   0x00000000 /* sdbm hash */
+#define BE_LB_HFCN_DJB2   0x04000000 /* djb2 hash */
+#define BE_LB_HFCN_WT6    0x08000000 /* wt6 hash */
+#define BE_LB_HFCN_CRC32  0x0C000000 /* crc32 hash */
+#define BE_LB_HFCN_NONE   0x10000000 /* none - no hash */
+#define BE_LB_HASH_FUNC   0x1C000000 /* get/clear hash function */
+
+
+/* various constants */
+
+/* The scale factor between user weight and effective weight allows smooth
+ * weight modulation even with small weights (eg: 1). It should not be too high
+ * though because it limits the number of servers in FWRR mode in order to
+ * prevent any integer overflow. The max number of servers per backend is
+ * limited to about (2^32-1)/256^2/scale ~= 65535.9999/scale. A scale of 16
+ * looks like a good value, as it allows 4095 servers per backend while leaving
+ * modulation steps of about 6% for servers with the lowest weight (1).
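+ *
+ * Worked example (illustrative): with a scale of 16, a user weight of 1
+ * becomes an effective weight of 16, allowing modulation in steps of 1/16
+ * (~6%), while the server limit becomes about 65535.9999/16 ~= 4096, hence
+ * the 4095 usable servers per backend quoted above.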
+ */ +#define BE_WEIGHT_SCALE 16 + +/* LB parameters for all algorithms */ +struct lbprm { + union { /* LB parameters depending on the algo type */ + struct lb_map map; + struct lb_fwrr fwrr; + struct lb_fwlc fwlc; + struct lb_chash chash; + struct lb_fas fas; + struct { + struct server **srv; /* array containing in-use log servers */ + struct list avail; /* servers available for lb are registered in this list */ + uint32_t lastid; /* last relative id used */ + } log; /* used in log-balancing context (PR_MODE_SYSLOG backend) */ + }; + uint32_t algo; /* load balancing algorithm and variants: BE_LB_* */ + int tot_wact, tot_wbck; /* total effective weights of active and backup servers */ + int tot_weight; /* total effective weight of servers participating to LB */ + int tot_uweight; /* total user weight of servers participating to LB (for reporting) */ + int tot_used; /* total number of servers used for LB */ + int wmult; /* ratio between user weight and effective weight */ + int wdiv; /* ratio between effective weight and user weight */ + int hash_balance_factor; /* load balancing factor * 100, 0 if disabled */ + struct sample_expr *expr; /* sample expression for "balance hash" */ + char *arg_str; /* name of the URL parameter/header/cookie used for hashing */ + int arg_len; /* strlen(arg_str), computed only once */ + int arg_opt1; /* extra option 1 for the LB algo (algo-specific) */ + int arg_opt2; /* extra option 2 for the LB algo (algo-specific) */ + int arg_opt3; /* extra option 3 for the LB algo (algo-specific) */ + __decl_thread(HA_RWLOCK_T lock); + struct server *fbck; /* first backup server when !PR_O_USE_ALL_BK, or NULL */ + + /* Call backs for some actions. Any of them may be NULL (thus should be ignored). + * Those marked "srvlock" will need to be called with the server lock held. + * The other ones might take it themselves if needed. + */ + void (*update_server_eweight)(struct server *); /* to be called after eweight change // srvlock */ + void (*set_server_status_up)(struct server *); /* to be called after status changes to UP // srvlock */ + void (*set_server_status_down)(struct server *); /* to be called after status changes to DOWN // srvlock */ + void (*server_take_conn)(struct server *); /* to be called when connection is assigned */ + void (*server_drop_conn)(struct server *); /* to be called when connection is dropped */ +}; + +#endif /* _HAPROXY_BACKEND_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/backend.h b/include/haproxy/backend.h new file mode 100644 index 0000000..4ab9170 --- /dev/null +++ b/include/haproxy/backend.h @@ -0,0 +1,158 @@ +/* + * include/haproxy/backend.h + * Functions prototypes for the backend. + * + * Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_BACKEND_H +#define _HAPROXY_BACKEND_H + +#include <haproxy/api.h> +#include <haproxy/backend-t.h> +#include <haproxy/clock.h> +#include <haproxy/proxy-t.h> +#include <haproxy/server-t.h> +#include <haproxy/stream-t.h> +#include <haproxy/time.h> + +int assign_server(struct stream *s); +int assign_server_address(struct stream *s); +int assign_server_and_queue(struct stream *s); +int alloc_bind_address(struct sockaddr_storage **ss, + struct server *srv, struct proxy *be, + struct stream *s); +int srv_redispatch_connect(struct stream *t); +void back_try_conn_req(struct stream *s); +void back_handle_st_req(struct stream *s); +void back_handle_st_con(struct stream *s); +void back_handle_st_rdy(struct stream *s); +void back_handle_st_cer(struct stream *s); + +const char *backend_lb_algo_str(int algo); +int backend_parse_balance(const char **args, char **err, struct proxy *curproxy); +int tcp_persist_rdp_cookie(struct stream *s, struct channel *req, int an_bit); + +int be_downtime(struct proxy *px); +void recount_servers(struct proxy *px); +void update_backend_weight(struct proxy *px); +int be_lastsession(const struct proxy *be); + +/* Returns number of usable servers in backend */ +static inline int be_usable_srv(struct proxy *be) +{ + if (be->flags & PR_FL_DISABLED) + return 0; + else if (be->srv_act) + return be->srv_act; + else if (be->lbprm.fbck) + return 1; + else + return be->srv_bck; +} + +/* set the time of last session on the backend */ +static inline void be_set_sess_last(struct proxy *be) +{ + be->be_counters.last_sess = ns_to_sec(now_ns); +} + +/* This function returns non-zero if the designated server will be + * usable for LB according to pending weight and state. + * Otherwise it returns 0. + */ +static inline int srv_willbe_usable(const struct server *srv) +{ + enum srv_state state = srv->next_state; + + if (!srv->next_eweight) + return 0; + if (srv->next_admin & SRV_ADMF_MAINT) + return 0; + if (srv->next_admin & SRV_ADMF_DRAIN) + return 0; + switch (state) { + case SRV_ST_STARTING: + case SRV_ST_RUNNING: + return 1; + case SRV_ST_STOPPING: + case SRV_ST_STOPPED: + return 0; + } + return 0; +} + +/* This function returns non-zero if the designated server was usable for LB + * according to its current weight and state. Otherwise it returns 0. + */ +static inline int srv_currently_usable(const struct server *srv) +{ + enum srv_state state = srv->cur_state; + + if (!srv->cur_eweight) + return 0; + if (srv->cur_admin & SRV_ADMF_MAINT) + return 0; + if (srv->cur_admin & SRV_ADMF_DRAIN) + return 0; + switch (state) { + case SRV_ST_STARTING: + case SRV_ST_RUNNING: + return 1; + case SRV_ST_STOPPING: + case SRV_ST_STOPPED: + return 0; + } + return 0; +} + +/* This function commits the next server state and weight onto the current + * ones in order to detect future changes. The server's lock is expected to + * be held when calling this function. + */ +static inline void srv_lb_commit_status(struct server *srv) +{ + srv->cur_state = srv->next_state; + srv->cur_admin = srv->next_admin; + srv->cur_eweight = srv->next_eweight; +} + +/* This function returns true when a server has experienced a change since last + * commit on its state or weight, otherwise zero. 
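+ *
+ * Typical usage (an illustrative sketch) pairs it with
+ * srv_lb_commit_status() above:
+ *
+ *	if (srv_lb_status_changed(srv)) {
+ *		// refresh the LB structures from the next_* values, then:
+ *		srv_lb_commit_status(srv);
+ *	}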
+ */ +static inline int srv_lb_status_changed(const struct server *srv) +{ + return (srv->next_state != srv->cur_state || + srv->next_admin != srv->cur_admin || + srv->next_eweight != srv->cur_eweight); +} + +/* sends a log message when a backend goes down, and also sets last + * change date. + */ +void set_backend_down(struct proxy *be); + +unsigned int gen_hash(const struct proxy* px, const char* key, unsigned long len); + +#endif /* _HAPROXY_BACKEND_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/base64.h b/include/haproxy/base64.h new file mode 100644 index 0000000..ace6063 --- /dev/null +++ b/include/haproxy/base64.h @@ -0,0 +1,28 @@ +/* + * include/haproxy/base64.h + * Ascii to Base64 conversion as described in RFC1421. + * + * Copyright 2006-2020 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#ifndef _HAPROXY_BASE64_H +#define _HAPROXY_BASE64_H + +#include <haproxy/api.h> + +int a2base64(char *in, int ilen, char *out, int olen); +int a2base64url(const char *in, size_t ilen, char *out, size_t olen); +int base64dec(const char *in, size_t ilen, char *out, size_t olen); +int base64urldec(const char *in, size_t ilen, char *out, size_t olen); +const char *s30tob64(int in, char *out); +int b64tos30(const char *in); + +extern const char base64tab[]; + +#endif /* _HAPROXY_BASE64_H */ diff --git a/include/haproxy/buf-t.h b/include/haproxy/buf-t.h new file mode 100644 index 0000000..3c0f8b5 --- /dev/null +++ b/include/haproxy/buf-t.h @@ -0,0 +1,62 @@ +/* + * include/haproxy/buf-t.h + * Simple buffer handling - types definitions. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef _HAPROXY_BUF_T_H +#define _HAPROXY_BUF_T_H + +#include <haproxy/api-t.h> + +/* Structure defining a buffer's head */ +struct buffer { + size_t size; /* buffer size in bytes */ + char *area; /* points to <size> bytes */ + size_t data; /* amount of data after head including wrapping */ + size_t head; /* start offset of remaining data relative to area */ +}; + +/* A buffer may be in 3 different states : + * - unallocated : size == 0, area == 0 (b_is_null() is true) + * - waiting : size == 0, area != 0 (b_is_null() is true) + * - allocated : size > 0, area > 0 (b_is_null() is false) + */ + +/* initializers for certain buffer states. It is important that the NULL buffer + * remains the one with all fields initialized to zero so that a calloc() or a + * memset() on a struct automatically sets a NULL buffer. + */ +#define BUF_NULL ((struct buffer){ }) +#define BUF_WANTED ((struct buffer){ .area = (char *)1 }) +#define BUF_RING ((struct buffer){ .area = (char *)2 }) + +#endif /* _HAPROXY_BUF_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/buf.h b/include/haproxy/buf.h new file mode 100644 index 0000000..e98161e --- /dev/null +++ b/include/haproxy/buf.h @@ -0,0 +1,1161 @@ +/* + * include/haproxy/buf.h + * Simple buffer handling - functions definitions. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _HAPROXY_BUF_H +#define _HAPROXY_BUF_H + +#include <sys/types.h> +#include <string.h> +#include <haproxy/api.h> +#include <haproxy/buf-t.h> + +/***************************************************************************/ +/* Functions used to compute offsets and pointers. Most of them exist in */ +/* both wrapping-safe and unchecked ("__" prefix) variants. Some returning */ +/* a pointer are also provided with an "_ofs" suffix when they return an */ +/* offset relative to the storage area. */ +/***************************************************************************/ + +/* b_is_null() : returns true if (and only if) the buffer is not yet allocated + * and thus has an empty size. Its pointer may then be anything, including NULL + * (unallocated) or an invalid pointer such as (char*)1 (allocation pending). + */ +static inline int b_is_null(const struct buffer *buf) +{ + return buf->size == 0; +} + +/* b_orig() : returns the pointer to the origin of the storage, which is the + * location of byte at offset zero. 
This is mostly used by functions which + * handle the wrapping by themselves. + */ +static inline char *b_orig(const struct buffer *b) +{ + return b->area; +} + +/* b_size() : returns the size of the buffer. */ +static inline size_t b_size(const struct buffer *b) +{ + return b->size; +} + +/* b_wrap() : returns the pointer to the wrapping position of the buffer area, + * which is by definition the first byte not part of the buffer. + */ +static inline char *b_wrap(const struct buffer *b) +{ + return b->area + b->size; +} + +/* b_data() : returns the number of bytes present in the buffer. */ +static inline size_t b_data(const struct buffer *b) +{ + return b->data; +} + +/* b_room() : returns the amount of room left in the buffer */ +static inline size_t b_room(const struct buffer *b) +{ + BUG_ON_HOT(b->data > b->size); + return b->size - b_data(b); +} + +/* b_full() : returns true if the buffer is full. */ +static inline size_t b_full(const struct buffer *b) +{ + return !b_room(b); +} + + +/* b_stop() : returns the pointer to the byte following the end of the buffer, + * which may be out of the buffer if the buffer ends on the last byte of the + * area. + */ +static inline size_t __b_stop_ofs(const struct buffer *b) +{ + return b->head + b->data; +} + +static inline const char *__b_stop(const struct buffer *b) +{ + return b_orig(b) + __b_stop_ofs(b); +} + +static inline size_t b_stop_ofs(const struct buffer *b) +{ + size_t stop = __b_stop_ofs(b); + + if (stop > b->size) + stop -= b->size; + return stop; +} + +static inline const char *b_stop(const struct buffer *b) +{ + return b_orig(b) + b_stop_ofs(b); +} + + +/* b_peek() : returns a pointer to the data at position <ofs> relative to the + * head of the buffer. Will typically point to input data if called with the + * amount of output data. The wrapped versions will only support wrapping once + * before the beginning or after the end. + */ +static inline size_t __b_peek_ofs(const struct buffer *b, size_t ofs) +{ + return b->head + ofs; +} + +static inline char *__b_peek(const struct buffer *b, size_t ofs) +{ + return b_orig(b) + __b_peek_ofs(b, ofs); +} + +static inline size_t b_peek_ofs(const struct buffer *b, size_t ofs) +{ + size_t ret = __b_peek_ofs(b, ofs); + + if (ret >= b->size) + ret -= b->size; + + return ret; +} + +static inline char *b_peek(const struct buffer *b, size_t ofs) +{ + return b_orig(b) + b_peek_ofs(b, ofs); +} + + +/* b_head() : returns the pointer to the buffer's head, which is the location + * of the next byte to be dequeued. Note that for buffers of size zero, the + * returned pointer may be outside of the buffer or even invalid. + */ +static inline size_t __b_head_ofs(const struct buffer *b) +{ + return b->head; +} + +static inline char *__b_head(const struct buffer *b) +{ + return b_orig(b) + __b_head_ofs(b); +} + +static inline size_t b_head_ofs(const struct buffer *b) +{ + return __b_head_ofs(b); +} + +static inline char *b_head(const struct buffer *b) +{ + return __b_head(b); +} + + +/* b_tail() : returns the pointer to the tail of the buffer, which is the + * location of the first byte where it is possible to enqueue new data. Note + * that for buffers of size zero, the returned pointer may be outside of the + * buffer or even invalid. 
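+ *
+ * Example (illustrative): in a buffer with size == 8, head == 6 and
+ * data == 4, the stored bytes wrap, so b_head() points to area+6,
+ * b_peek(b, 2) wraps back to area+0, and b_tail() points to area+2.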
+ */ +static inline size_t __b_tail_ofs(const struct buffer *b) +{ + return __b_peek_ofs(b, b_data(b)); +} + +static inline char *__b_tail(const struct buffer *b) +{ + return __b_peek(b, b_data(b)); +} + +static inline size_t b_tail_ofs(const struct buffer *b) +{ + return b_peek_ofs(b, b_data(b)); +} + +static inline char *b_tail(const struct buffer *b) +{ + return b_peek(b, b_data(b)); +} + + +/* b_next() : for an absolute pointer <p> or a relative offset <o> pointing to + * a valid location within buffer <b>, returns either the absolute pointer or + * the relative offset pointing to the next byte, which usually is at (p + 1) + * unless p reaches the wrapping point and wrapping is needed. + */ +static inline size_t b_next_ofs(const struct buffer *b, size_t o) +{ + o++; + BUG_ON_HOT(o > b->size); + if (o == b->size) + o = 0; + return o; +} + +static inline char *b_next(const struct buffer *b, const char *p) +{ + p++; + BUG_ON_HOT(p > b_wrap(b)); + if (p == b_wrap(b)) + p = b_orig(b); + return (char *)p; +} + +/* b_dist() : returns the distance between two pointers, taking into account + * the ability to wrap around the buffer's end. The operation is not defined if + * either of the pointers does not belong to the buffer or if their distance is + * greater than the buffer's size. + */ +static inline size_t b_dist(const struct buffer *b, const char *from, const char *to) +{ + ssize_t dist = to - from; + + BUG_ON_HOT((dist > 0 && dist > b_size(b)) || (dist < 0 && -dist > b_size(b))); + dist += dist < 0 ? b_size(b) : 0; + return dist; +} + +/* b_almost_full() : returns 1 if the buffer uses at least 3/4 of its capacity, + * otherwise zero. Buffers of size zero are considered full. + */ +static inline int b_almost_full(const struct buffer *b) +{ + BUG_ON_HOT(b->data > b->size); + return b_data(b) >= b_size(b) * 3 / 4; +} + +/* b_space_wraps() : returns non-zero only if the buffer's free space wraps : + * [ |xxxx| ] => yes + * [xxxx| ] => no + * [ |xxxx] => no + * [xxxx| |xxxx] => no + * [xxxxxxxxxx|xxxxxxxxxxx] => no + * + * So the only case where the buffer does not wrap is when there's data either + * at the beginning or at the end of the buffer. Thus we have this : + * - if (head <= 0) ==> doesn't wrap + * - if (tail >= size) ==> doesn't wrap + * - otherwise wraps + */ +static inline int b_space_wraps(const struct buffer *b) +{ + BUG_ON_HOT(b->data > b->size); + if ((ssize_t)__b_head_ofs(b) <= 0) + return 0; + if (__b_tail_ofs(b) >= b_size(b)) + return 0; + return 1; +} + +/* b_contig_data() : returns the amount of data that can contiguously be read + * at once starting from a relative offset <start> (which allows to easily + * pre-compute blocks for memcpy). The start point will typically contain the + * amount of past data already returned by a previous call to this function. + */ +static inline size_t b_contig_data(const struct buffer *b, size_t start) +{ + size_t data = b_wrap(b) - b_peek(b, start); + size_t limit = b_data(b) - start; + + if (data > limit) + data = limit; + return data; +} + +/* b_contig_space() : returns the amount of bytes that can be appended to the + * buffer at once. 
We have 8 possible cases : + * + * [____________________] return size + * [______|_____________] return size - tail_ofs + * [XXXXXX|_____________] return size - tail_ofs + * [___|XXXXXX|_________] return size - tail_ofs + * [______________XXXXXX] return head_ofs + * [XXXX|___________|XXX] return head_ofs - tail_ofs + * [XXXXXXXXXX|XXXXXXXXX] return 0 + * [XXXXXXXXXXXXXXXXXXXX] return 0 + */ +static inline size_t b_contig_space(const struct buffer *b) +{ + size_t left, right; + + BUG_ON_HOT(b->data > b->size); + + right = b_head_ofs(b); + left = right + b_data(b); + + left = b_size(b) - left; + if ((ssize_t)left <= 0) + left += right; + return left; +} + +/* b_getblk() : gets one full block of data at once from a buffer, starting + * from offset <offset> after the buffer's head, and limited to no more than + * <len> bytes. The caller is responsible for ensuring that neither <offset> + * nor <offset>+<len> exceed the total number of bytes available in the buffer. + * Return values : + * >0 : number of bytes read, equal to requested size. + * =0 : not enough data available. <blk> is left undefined. + * The buffer is left unaffected. + */ +static inline size_t b_getblk(const struct buffer *buf, char *blk, size_t len, size_t offset) +{ + size_t firstblock; + + BUG_ON(buf->data > buf->size); + BUG_ON(offset > buf->data); + BUG_ON(offset + len > buf->data); + + if (len + offset > b_data(buf)) + return 0; + + firstblock = b_wrap(buf) - b_head(buf); + if (firstblock > offset) { + if (firstblock >= len + offset) { + memcpy(blk, b_head(buf) + offset, len); + return len; + } + + memcpy(blk, b_head(buf) + offset, firstblock - offset); + memcpy(blk + firstblock - offset, b_orig(buf), len - firstblock + offset); + return len; + } + + memcpy(blk, b_orig(buf) + offset - firstblock, len); + return len; +} + +/* b_getblk_nc() : gets one or two blocks of data at once from a buffer, + * starting from offset <ofs> after the beginning of its output, and limited to + * no more than <max> bytes. The caller is responsible for ensuring that + * neither <ofs> nor <ofs>+<max> exceed the total number of bytes available in + * the buffer. Return values : + * >0 : number of blocks filled (1 or 2). blk1 is always filled before blk2. + * =0 : not enough data available. <blk*> are left undefined. + * The buffer is left unaffected. Unused buffers are left in an undefined state. + */ +static inline size_t b_getblk_nc(const struct buffer *buf, const char **blk1, size_t *len1, const char **blk2, size_t *len2, size_t ofs, size_t max) +{ + size_t l1; + + BUG_ON_HOT(buf->data > buf->size); + BUG_ON_HOT(ofs > buf->data); + BUG_ON_HOT(ofs + max > buf->data); + + if (!max) + return 0; + + *blk1 = b_peek(buf, ofs); + l1 = b_wrap(buf) - *blk1; + if (l1 < max) { + *len1 = l1; + *len2 = max - l1; + *blk2 = b_orig(buf); + return 2; + } + *len1 = max; + return 1; +} + + +/*********************************************/ +/* Functions used to modify the buffer state */ +/*********************************************/ + +/* b_reset() : resets a buffer. The size is not touched. 
*/ +static inline void b_reset(struct buffer *b) +{ + b->head = 0; + b->data = 0; +} + +/* b_make() : make a buffer from all parameters */ +static inline struct buffer b_make(char *area, size_t size, size_t head, size_t data) +{ + struct buffer b; + + b.area = area; + b.size = size; + b.head = head; + b.data = data; + return b; +} + +/* b_sub() : decreases the buffer length by <count> */ +static inline void b_sub(struct buffer *b, size_t count) +{ + BUG_ON_HOT(b->data < count); + b->data -= count; +} + +/* b_add() : increase the buffer length by <count> */ +static inline void b_add(struct buffer *b, size_t count) +{ + BUG_ON_HOT(b->data + count > b->size); + b->data += count; +} + +/* b_set_data() : sets the buffer's length */ +static inline void b_set_data(struct buffer *b, size_t len) +{ + BUG_ON_HOT(len > b->size); + b->data = len; +} + +/* b_del() : skips <del> bytes in a buffer <b>. Covers both the output and the + * input parts so it's up to the caller to know where it plays and that <del> + * is always smaller than the amount of data in the buffer. + */ +static inline void b_del(struct buffer *b, size_t del) +{ + BUG_ON_HOT(b->data < del); + b->data -= del; + b->head += del; + if (b->head >= b->size) + b->head -= b->size; +} + +/* b_realign_if_empty() : realigns a buffer if it's empty */ +static inline void b_realign_if_empty(struct buffer *b) +{ + if (!b_data(b)) + b->head = 0; +} + +/* b_slow_realign() : this function realigns a possibly wrapping buffer so that + * the part remaining to be parsed is contiguous and starts at the beginning of + * the buffer and the already parsed output part ends at the end of the buffer. + * This provides the best conditions since it allows the largest inputs to be + * processed at once and ensures that once the output data leaves, the whole + * buffer is available at once. The number of output bytes supposedly present + * at the beginning of the buffer and which need to be moved to the end must be + * passed in <output>. A temporary swap area at least as large as b->size must + * be provided in <swap>. It's up to the caller to ensure <output> is no larger + * than the difference between the whole buffer's length and its input. + */ +static inline void b_slow_realign(struct buffer *b, char *swap, size_t output) +{ + size_t block1 = output; + size_t block2 = 0; + + BUG_ON_HOT(b->data > b->size); + + /* process output data in two steps to cover wrapping */ + if (block1 > b_size(b) - b_head_ofs(b)) { + block2 = b_peek_ofs(b, block1); + block1 -= block2; + } + memcpy(swap + b_size(b) - output, b_head(b), block1); + memcpy(swap + b_size(b) - block2, b_orig(b), block2); + + /* process input data in two steps to cover wrapping */ + block1 = b_data(b) - output; + block2 = 0; + + if (block1 > b_tail_ofs(b)) { + block2 = b_tail_ofs(b); + block1 = block1 - block2; + } + memcpy(swap, b_peek(b, output), block1); + memcpy(swap + block1, b_orig(b), block2); + + /* reinject changes into the buffer */ + memcpy(b_orig(b), swap, b_data(b) - output); + memcpy(b_wrap(b) - output, swap + b_size(b) - output, output); + + b->head = (output ? b_size(b) - output : 0); +} + +/* b_slow_realign_ofs() : this function realigns a possibly wrapping buffer + * setting its new head at <ofs>. Depending of the <ofs> value, the resulting + * buffer may also wrap. A temporary swap area at least as large as b->size must + * be provided in <swap>. It's up to the caller to ensuze <ofs> is not larger + * than b->size. 
+ */
+static inline void b_slow_realign_ofs(struct buffer *b, char *swap, size_t ofs)
+{
+	size_t block1 = b_data(b);
+	size_t block2 = 0;
+
+	BUG_ON_HOT(b->data > b->size);
+	BUG_ON_HOT(ofs > b->size);
+
+	if (__b_tail_ofs(b) >= b_size(b)) {
+		block2 = b_tail_ofs(b);
+		block1 -= block2;
+	}
+	memcpy(swap, b_head(b), block1);
+	memcpy(swap + block1, b_orig(b), block2);
+
+	block1 = b_data(b);
+	block2 = 0;
+	if (block1 > b_size(b) - ofs) {
+		block1 = b_size(b) - ofs;
+		block2 = b_data(b) - block1;
+	}
+	memcpy(b_orig(b) + ofs, swap, block1);
+	memcpy(b_orig(b), swap + block1, block2);
+
+	b->head = ofs;
+}
+
+
+/* b_putchr() : tries to append char <c> at the end of buffer <b>. Supports
+ * wrapping. The character is dropped if the buffer is full.
+ */
+static inline void b_putchr(struct buffer *b, char c)
+{
+	if (b_full(b))
+		return;
+	*b_tail(b) = c;
+	b->data++;
+}
+
+/* __b_putblk() : tries to append <len> bytes from block <blk> to the end of
+ * buffer <b> without checking for free space (it's up to the caller to do it).
+ * Supports wrapping. It must not be called with len == 0.
+ */
+static inline void __b_putblk(struct buffer *b, const char *blk, size_t len)
+{
+	size_t half = b_contig_space(b);
+
+	BUG_ON(b_data(b) + len > b_size(b));
+
+	if (half > len)
+		half = len;
+
+	memcpy(b_tail(b), blk, half);
+
+	if (len > half)
+		memcpy(b_peek(b, b_data(b) + half), blk + half, len - half);
+	b->data += len;
+}
+
+/* b_putblk() : tries to append block <blk> at the end of buffer <b>. Supports
+ * wrapping. Data are truncated if buffer is too short. It returns the number
+ * of bytes copied.
+ */
+static inline size_t b_putblk(struct buffer *b, const char *blk, size_t len)
+{
+	if (len > b_room(b))
+		len = b_room(b);
+	if (len)
+		__b_putblk(b, blk, len);
+	return len;
+}
+
+/* b_xfer() : transfers at most <count> bytes from buffer <src> to buffer <dst>
+ * and returns the number of bytes copied. The bytes are removed from <src> and
+ * added to <dst>. The caller is responsible for ensuring that <count> is not
+ * larger than b_room(dst). Whenever possible (the destination is empty and at
+ * least as much data as the source holds was requested), the buffers are
+ * simply swapped instead of copied.
+ */
+static inline size_t b_xfer(struct buffer *dst, struct buffer *src, size_t count)
+{
+	size_t ret, block1, block2;
+
+	ret = 0;
+	if (!count)
+		goto leave;
+
+	ret = b_data(src);
+	if (!ret)
+		goto leave;
+
+	if (ret > count)
+		ret = count;
+	else if (!b_data(dst)) {
+		/* zero copy is possible by just swapping buffers */
+		struct buffer tmp = *dst;
+		*dst = *src;
+		*src = tmp;
+		goto leave;
+	}
+
+	block1 = b_contig_data(src, 0);
+	if (block1 > ret)
+		block1 = ret;
+	block2 = ret - block1;
+
+	if (block1)
+		__b_putblk(dst, b_head(src), block1);
+
+	if (block2)
+		__b_putblk(dst, b_peek(src, block1), block2);
+
+	b_del(src, ret);
+ leave:
+	return ret;
+}
+
+/* b_ncat() : Copy <count> bytes from <src> buffer to the end of <dst> buffer.
+ * The caller is responsible for ensuring that <count> is not larger than
+ * b_room(dst).
+ * Returns the number of bytes copied.
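+ *
+ * Editor's illustration (not from the original source; assumes the usual
+ * MIN() helper): contrary to b_xfer(), b_ncat() leaves <src> untouched, so
+ * the same data may be appended to several destinations before <src> is
+ * finally consumed:
+ *
+ *	size_t n = MIN(b_data(&src), b_room(&dst));
+ *	b_ncat(&dst, &src, n);		// src still holds its data
+ *	b_force_xfer(&dst2, &src, n);	// copies then deletes from src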
+ */
+static inline size_t b_ncat(struct buffer *dst, struct buffer *src, size_t count)
+{
+	size_t ret, block1, block2;
+
+	ret = 0;
+	if (!count)
+		goto leave;
+
+	ret = b_data(src);
+	if (!ret)
+		goto leave;
+
+	if (ret > count)
+		ret = count;
+	block1 = b_contig_data(src, 0);
+	if (block1 > ret)
+		block1 = ret;
+	block2 = ret - block1;
+
+	if (block1)
+		__b_putblk(dst, b_head(src), block1);
+
+	if (block2)
+		__b_putblk(dst, b_peek(src, block1), block2);
+
+ leave:
+	return ret;
+}
+
+/* b_force_xfer() : same as b_xfer() but without zero copy.
+ * The caller is responsible for ensuring that <count> is not
+ * larger than b_room(dst).
+ */
+static inline size_t b_force_xfer(struct buffer *dst, struct buffer *src, size_t count)
+{
+	size_t ret;
+
+	ret = b_ncat(dst, src, count);
+	b_del(src, ret);
+
+	return ret;
+}
+
+
+/* Moves <len> bytes from absolute position <src> of buffer <b> by <shift>
+ * bytes, while supporting wrapping of both the source and the destination.
+ * The position is relative to the buffer's origin and may overlap with the
+ * target position. The <shift>'s absolute value must be strictly lower than
+ * the buffer's size. The main purpose is to aggregate data blocks during
+ * parsing while removing unused delimiters. The buffer's length is not
+ * modified, and the caller must take care of size adjustments and holes by
+ * itself.
+ */
+static inline void b_move(const struct buffer *b, size_t src, size_t len, ssize_t shift)
+{
+	char *orig = b_orig(b);
+	size_t size = b_size(b);
+	size_t dst = src + size + shift;
+	size_t cnt;
+
+	BUG_ON(len > size);
+
+	if (dst >= size)
+		dst -= size;
+
+	if (shift < 0) {
+		BUG_ON(-shift >= size);
+		/* copy from left to right */
+		for (; (cnt = len); len -= cnt) {
+			if (cnt > size - src)
+				cnt = size - src;
+			if (cnt > size - dst)
+				cnt = size - dst;
+
+			memmove(orig + dst, orig + src, cnt);
+			dst += cnt;
+			src += cnt;
+			if (dst >= size)
+				dst -= size;
+			if (src >= size)
+				src -= size;
+		}
+	}
+	else if (shift > 0) {
+		BUG_ON(shift >= size);
+		/* copy from right to left */
+		for (; (cnt = len); len -= cnt) {
+			size_t src_end = src + len;
+			size_t dst_end = dst + len;
+
+			if (dst_end > size)
+				dst_end -= size;
+			if (src_end > size)
+				src_end -= size;
+
+			if (cnt > dst_end)
+				cnt = dst_end;
+			if (cnt > src_end)
+				cnt = src_end;
+
+			memmove(orig + dst_end - cnt, orig + src_end - cnt, cnt);
+		}
+	}
+}
+
+/* b_rep_blk() : writes the block <blk> at position <pos> which must be in
+ * buffer <b>, and moves the part between <end> and the buffer's tail just
+ * after the end of the copy of <blk>. This effectively replaces the part
+ * located between <pos> and <end> with a copy of <blk> of length <len>. The
+ * buffer's length is automatically updated. This is used to replace a block
+ * with another one inside a buffer. The shift value (positive or negative) is
+ * returned. If there's no space left, the move is not done. If <len> is null,
+ * the <blk> pointer is allowed to be null, in order to erase a block.
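+ *
+ * A hedged example (editor's addition): replacing a 3-byte token found at
+ * <pos> (so <end> = pos + 3) with a longer word, or erasing it entirely:
+ *
+ *	b_rep_blk(&b, pos, pos + 3, "foobar", 6);	// shift = +3
+ *	b_rep_blk(&b, pos, pos + 3, NULL, 0);		// erases the block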
+ */
+static inline int b_rep_blk(struct buffer *b, char *pos, char *end, const char *blk, size_t len)
+{
+	int delta;
+
+	BUG_ON(pos < b->area || pos >= b->area + b->size);
+
+	delta = len - (end - pos);
+
+	if (__b_tail(b) + delta > b_wrap(b))
+		return 0;  /* no space left */
+
+	if (b_data(b) &&
+	    b_tail(b) + delta > b_head(b) &&
+	    b_head(b) >= b_tail(b))
+		return 0;  /* no space left before wrapping data */
+
+	/* first, protect the end of the buffer */
+	memmove(end + delta, end, b_tail(b) - end);
+
+	/* now, copy blk over pos */
+	if (len)
+		memcpy(pos, blk, len);
+
+	b_add(b, delta);
+	b_realign_if_empty(b);
+
+	return delta;
+}
+
+/* b_insert_blk(): inserts the block <blk> at the absolute offset <off> moving
+ * data between this offset and the buffer's tail just after the end of the copy
+ * of <blk>. The buffer's length is automatically updated. It supports
+ * wrapping. If there is not enough space to perform the copy, 0 is
+ * returned. Otherwise, the number of bytes copied is returned.
+ */
+static inline int b_insert_blk(struct buffer *b, size_t off, const char *blk, size_t len)
+{
+	size_t pos;
+
+	if (!len || len > b_room(b))
+		return 0; /* nothing to copy or not enough space left */
+
+	pos = b_peek_ofs(b, off);
+	if (pos == b_tail_ofs(b))
+		__b_putblk(b, blk, len);
+	else {
+		size_t delta = b_data(b) - off;
+
+		/* first, protect the end of the buffer */
+		b_move(b, pos, delta, len);
+
+		/* change the amount of data in the buffer during the copy */
+		b_sub(b, delta);
+		__b_putblk(b, blk, len);
+		b_add(b, delta);
+	}
+	return len;
+}
+
+/* __b_put_varint(): encode 64-bit value <v> as a varint into buffer <b>. The
+ * caller must have checked that the encoded value fits in the buffer so that
+ * there are no length checks. Wrapping is supported. You don't want to use
+ * this function but b_put_varint() instead.
+ */
+static inline void __b_put_varint(struct buffer *b, uint64_t v)
+{
+	size_t data = b->data;
+	size_t size = b_size(b);
+	char *wrap = b_wrap(b);
+	char *tail = b_tail(b);
+
+	BUG_ON_HOT(data >= size);
+
+	if (v >= 0xF0) {
+		/* more than one byte, first write the 4 least significant
+		 * bits, then follow with 7 bits per byte.
+		 */
+		*tail = v | 0xF0;
+		v = (v - 0xF0) >> 4;
+
+		while (1) {
+			if (++tail == wrap)
+				tail -= size;
+			data++;
+			if (v < 0x80)
+				break;
+			*tail = v | 0x80;
+			v = (v - 0x80) >> 7;
+		}
+	}
+
+	/* last byte */
+	*tail = v;
+	BUG_ON_HOT(data >= size);
+	data++;
+	b->data = data;
+}
+
+/* b_put_varint(): try to encode value <v> as a varint into buffer <b>. Returns
+ * the number of bytes written in case of success, or 0 if there is not enough
+ * room. Wrapping is supported. No partial writes will be performed.
+ */
+static inline int b_put_varint(struct buffer *b, uint64_t v)
+{
+	size_t data = b->data;
+	size_t size = b_size(b);
+	char *wrap = b_wrap(b);
+	char *tail = b_tail(b);
+
+	if (data != size && v >= 0xF0) {
+		BUG_ON_HOT(data > size);
+
+		/* more than one byte, first write the 4 least significant
+		 * bits, then follow with 7 bits per byte.
+		 */
+		*tail = v | 0xF0;
+		v = (v - 0xF0) >> 4;
+
+		while (1) {
+			if (++tail == wrap)
+				tail -= size;
+			data++;
+			if (data == size || v < 0x80)
+				break;
+			*tail = v | 0x80;
+			v = (v - 0x80) >> 7;
+		}
+	}
+
+	/* last byte */
+	if (data == size)
+		return 0;
+
+	*tail = v;
+	data++;
+
+	size = data - b->data;
+	b->data = data;
+	return size;
+}
+
+/* b_get_varint(): try to decode a varint from buffer <b> into value <vptr>.
+ * Returns the number of bytes read in case of success, or 0 if there were not + * enough bytes. Wrapping is supported. No partial reads will be performed. + */ +static inline int b_get_varint(struct buffer *b, uint64_t *vptr) +{ + const uint8_t *head = (const uint8_t *)b_head(b); + const uint8_t *wrap = (const uint8_t *)b_wrap(b); + size_t data = b->data; + size_t size = b_size(b); + uint64_t v = 0; + int bits = 0; + + if (data != 0 && (*head >= 0xF0)) { + v = *head; + bits += 4; + while (1) { + if (++head == wrap) + head -= size; + data--; + if (!data || !(*head & 0x80)) + break; + v += (uint64_t)*head << bits; + bits += 7; + } + } + + /* last byte */ + if (!data) + return 0; + + v += (uint64_t)*head << bits; + *vptr = v; + data--; + size = b->data - data; + b_del(b, size); + return size; +} + +/* b_peek_varint(): try to decode a varint from buffer <b> at offset <ofs> + * relative to head, into value <vptr>. Returns the number of bytes parsed in + * case of success, or 0 if there were not enough bytes, in which case the + * contents of <vptr> are not updated. Wrapping is supported. The buffer's head + * will NOT be updated. It is illegal to call this function with <ofs> greater + * than b->data. + */ +static inline int b_peek_varint(struct buffer *b, size_t ofs, uint64_t *vptr) +{ + const uint8_t *head = (const uint8_t *)b_peek(b, ofs); + const uint8_t *wrap = (const uint8_t *)b_wrap(b); + size_t data = b_data(b) - ofs; + size_t size = b_size(b); + uint64_t v = 0; + int bits = 0; + + BUG_ON_HOT(ofs > b_data(b)); + + if (data != 0 && (*head >= 0xF0)) { + v = *head; + bits += 4; + while (1) { + if (++head == wrap) + head -= size; + data--; + if (!data || !(*head & 0x80)) + break; + v += (uint64_t)*head << bits; + bits += 7; + } + } + + /* last byte */ + if (!data) + return 0; + + v += (uint64_t)*head << bits; + *vptr = v; + data--; + size = b->data - ofs - data; + return size; +} + + +/* + * Buffer ring management. + * + * A buffer ring is a circular list of buffers, with a head buffer (the oldest, + * being read from) and a tail (the newest, being written to). Such a ring is + * declared as an array of buffers. The first element in the array is the root + * and is used differently. It stores the following elements : + * - size : number of allocated elements in the array, including the root + * - area : magic value BUF_RING (just to help debugging) + * - head : position of the head in the array (starts at one) + * - data : position of the tail in the array (starts at one). + * + * Note that contrary to a linear buffer, head and tail may be equal with room + * available, since the producer is expected to fill the tail. Also, the tail + * might pretty much be equal to BUF_WANTED if an allocation is pending, in + * which case it's illegal to try to allocate past this point (only one entry + * may be subscribed for allocation). It is illegal to allocate a buffer after + * an empty one, so that BUF_NULL is always the last buffer. It is also illegal + * to remove elements without freeing the buffers. Buffers between <tail> and + * <head> are in an undefined state, but <tail> and <head> are always valid. + * A ring may not contain less than 2 elements, since the root is mandatory, + * and at least one entry is required to always present a valid buffer. + * + * Given that buffers are 16- or 32- bytes long, it's convenient to set the + * size of the array to 2^N in order to keep (2^N)-1 elements, totalizing + * 2^N*16(or 32) bytes. 
For example on a 64-bit system, a ring of 31 usable + * buffers takes 1024 bytes. + */ + +/* Initialization of a ring, the size argument contains the number of allocated + * elements, including the root. There must always be at least 2 elements, one + * for the root and one for storage. + */ +static inline void br_init(struct buffer *r, size_t size) +{ + BUG_ON(size < 2); + + r->size = size; + r->area = BUF_RING.area; + r->head = r->data = 1; + r[1] = BUF_NULL; +} + +/* Returns number of elements in the ring, root included */ +static inline unsigned int br_size(const struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + return r->size; +} + +/* Returns true if no more buffers may be added */ +static inline unsigned int br_full(const struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + return r->data + 1 == r->head || r->data + 1 == r->head - 1 + r->size; +} + +/* Returns the number of buffers present */ +static inline unsigned int br_count(const struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + if (r->data >= r->head) + return r->data - r->head + 1; + else + return r->data + r->size - r->head; +} + +/* Returns true if a single buffer is assigned */ +static inline unsigned int br_single(const struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + return r->data == r->head; +} + +/* Returns the index of the ring's head buffer */ +static inline unsigned int br_head_idx(const struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + return r->head; +} + +/* Returns the index of the ring's tail buffer */ +static inline unsigned int br_tail_idx(const struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + return r->data; +} + +/* Returns a pointer to the ring's head buffer */ +static inline struct buffer *br_head(struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + return r + br_head_idx(r); +} + +/* Returns a pointer to the ring's tail buffer */ +static inline struct buffer *br_tail(struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + return r + br_tail_idx(r); +} + +/* Returns the amount of data of the ring's HEAD buffer */ +static inline unsigned int br_data(const struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + return b_data(r + br_head_idx(r)); +} + +/* Returns non-zero if the ring is non-full or its tail has some room */ +static inline unsigned int br_has_room(const struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + if (!br_full(r)) + return 1; + return b_room(r + br_tail_idx(r)); +} + +/* Advances the ring's tail if it points to a non-empty buffer, and returns the + * buffer, or NULL if the ring is full or the tail buffer is already empty. A + * new buffer is initialized to BUF_NULL before being returned. This is to be + * used after failing to append data, in order to decide to retry or not. + */ +static inline struct buffer *br_tail_add(struct buffer *r) +{ + struct buffer *b; + + BUG_ON_HOT(r->area != BUF_RING.area); + + b = br_tail(r); + if (!b_size(b)) + return NULL; + + if (br_full(r)) + return NULL; + + r->data++; + if (r->data >= r->size) + r->data = 1; + + b = br_tail(r); + *b = BUF_NULL; + return b; +} + +/* Extracts the ring's head buffer and returns it. The last buffer (tail) is + * never removed but it is returned. This guarantees that we stop on BUF_WANTED + * or BUF_EMPTY and that at the end a valid buffer remains present. This is + * used for pre-extraction during a free() loop for example. The caller is + * expected to detect the end (e.g. 
using bsize() since b_free() voids the + * buffer). + */ +static inline struct buffer *br_head_pick(struct buffer *r) +{ + struct buffer *b; + + BUG_ON_HOT(r->area != BUF_RING.area); + + b = br_head(r); + if (r->head != r->data) { + r->head++; + if (r->head >= r->size) + r->head = 1; + } + return b; +} + +/* Advances the ring's head and returns the next buffer, unless it's already + * the tail, in which case the tail itself is returned. This is used for post- + * parsing deletion. The caller is expected to detect the end (e.g. a parser + * will typically purge the head before proceeding). + */ +static inline struct buffer *br_del_head(struct buffer *r) +{ + BUG_ON_HOT(r->area != BUF_RING.area); + + if (r->head != r->data) { + r->head++; + if (r->head >= r->size) + r->head = 1; + } + return br_head(r); +} + +#endif /* _HAPROXY_BUF_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/bug.h b/include/haproxy/bug.h new file mode 100644 index 0000000..1356acf --- /dev/null +++ b/include/haproxy/bug.h @@ -0,0 +1,479 @@ +/* + * include/haproxy/bug.h + * Assertions and instant crash macros needed everywhere. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _HAPROXY_BUG_H +#define _HAPROXY_BUG_H + +#include <haproxy/atomic.h> +#include <haproxy/compiler.h> + +/* quick debugging hack, should really be removed ASAP */ +#ifdef DEBUG_FULL +#define DPRINTF(x...) fprintf(x) +#else +#define DPRINTF(x...) +#endif + +#define DUMP_TRACE() do { extern void ha_backtrace_to_stderr(void); ha_backtrace_to_stderr(); } while (0) + +/* First, let's try to handle some arch-specific crashing methods. We prefer + * the macro to the function because when opening the core, the debugger will + * directly show the calling point (e.g. the BUG_ON() condition) based on the + * line number, while the function will create new line numbers. But the + * function is needed e.g. if some pragmas are needed. 
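+ *
+ * Illustration (editor's note, hypothetical file and line): with the macro
+ * form, a core produced by a BUG_ON() at stream.c:42 makes the debugger's
+ * topmost frame point at stream.c:42 itself, while the function form would
+ * point into a shared helper instead.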
+ */
+
+#if defined(__i386__) || defined(__x86_64__)
+#define ha_crash_now() do {						\
+		/* ud2 opcode: 2 bytes, raises illegal instruction */	\
+		__asm__ volatile(".byte 0x0f,0x0b\n");			\
+		DO_NOT_FOLD();						\
+		my_unreachable();					\
+	} while (0)
+
+#elif defined(__aarch64__)
+#define ha_crash_now() do {						\
+		/* udf #imm16: 4 bytes, raises illegal instruction */	\
+		__asm__ volatile(".byte 0x00,0x00,0x00,0x00\n");	\
+		DO_NOT_FOLD();						\
+		my_unreachable();					\
+	} while (0)
+
+#else // not x86
+
+/* generic implementation, causes a segfault */
+static inline __attribute((always_inline)) void ha_crash_now(void)
+{
+#if __GNUC_PREREQ__(5, 0)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Warray-bounds"
+#if __GNUC_PREREQ__(6, 0)
+#pragma GCC diagnostic ignored "-Wnull-dereference"
+#endif
+#endif
+	*(volatile char *)1 = 0;
+#if __GNUC_PREREQ__(5, 0)
+#pragma GCC diagnostic pop
+#endif
+	DO_NOT_FOLD();
+	my_unreachable();
+}
+
+#endif // end of arch-specific ha_crash_now() definitions
+
+#ifdef DEBUG_USE_ABORT
+/* abort() is better recognized by code analysis tools */
+
+/* abort() is generally tagged noreturn, so there's no 100% safe way to prevent
+ * the compiler from doing a tail-merge here. Tests show that stopping folding
+ * just before calling abort() does work in practice at -O2, increasing the
+ * number of abort() calls in h3.o from 18 to 26, probably because there's no
+ * more savings to be made by replacing a call with a jump. However, at -Os it
+ * drops to 5 regardless of the build option. In order to help here, instead we
+ * wrap abort() into another function, with the line number stored into a local
+ * variable on the stack and we pretend to use it, so that unwinding the stack
+ * from abort() will reveal its value even if the call was folded.
+ */
+static __attribute__((noinline,noreturn,unused)) void abort_with_line(uint line)
+{
+	DISGUISE(&line);
+	abort();
+}
+
+#define ABORT_NOW() do { DUMP_TRACE(); abort_with_line(__LINE__); } while (0)
+#else
+/* More efficient than abort() because it does not mangle the
+ * stack and stops at the exact location we need.
+ */
+#define ABORT_NOW() do { DUMP_TRACE(); ha_crash_now(); } while (0)
+#endif
+
+/* This is the generic low-level macro dealing with conditional warnings and
+ * bugs. The caller decides whether to crash or not and what prefix and suffix
+ * to pass. The macro returns the boolean value of the condition as an int for
+ * the case where it wouldn't die. The <crash> flag is made of:
+ *  - crash & 1: crash yes/no;
+ *  - crash & 2: taint as bug instead of warn
+ */
+#define _BUG_ON(cond, file, line, crash, pfx, sfx)			\
+	__BUG_ON(cond, file, line, crash, pfx, sfx)
+
+#define __BUG_ON(cond, file, line, crash, pfx, sfx)			\
+	(void)(unlikely(cond) ? ({					\
+		complain(NULL, "\n" pfx "condition \"" #cond "\" matched at " file ":" #line "" sfx "\n", crash); \
+		if (crash & 1)						\
+			ABORT_NOW();					\
+		else							\
+			DUMP_TRACE();					\
+		1; /* let's return the true condition */		\
+	}) : 0)
+
+/* This one is equivalent except that it only emits the message once by
+ * maintaining a static counter. This may be used with warnings to detect
+ * certain unexpected conditions in the field. Later on, in cores it will be
+ * possible to verify these counters.
+ */
+#define _BUG_ON_ONCE(cond, file, line, crash, pfx, sfx)			\
+	__BUG_ON_ONCE(cond, file, line, crash, pfx, sfx)
+
+#define __BUG_ON_ONCE(cond, file, line, crash, pfx, sfx)		\
+	(void)(unlikely(cond) ? 
({ \ + static int __match_count_##line; \ + complain(&__match_count_##line, "\n" pfx "condition \"" #cond "\" matched at " file ":" #line "" sfx "\n", crash); \ + if (crash & 1) \ + ABORT_NOW(); \ + else \ + DUMP_TRACE(); \ + 1; /* let's return the true condition */ \ + }) : 0) + +/* DEBUG_STRICT enables/disables runtime checks on condition <cond> + * DEBUG_STRICT_ACTION indicates the level of verification on the rules when + * <cond> is true: + * + * macro BUG_ON() WARN_ON() CHECK_IF() + * value 0 warn warn warn + * 1 CRASH warn warn + * 2 CRASH CRASH warn + * 3 CRASH CRASH CRASH + */ + +/* The macros below are for general use */ +#if defined(DEBUG_STRICT) +# if defined(DEBUG_STRICT_ACTION) && (DEBUG_STRICT_ACTION < 1) +/* Lowest level: BUG_ON() warns, WARN_ON() warns, CHECK_IF() warns */ +# define BUG_ON(cond) _BUG_ON (cond, __FILE__, __LINE__, 2, "WARNING: bug ", " (not crashing but process is untrusted now, please report to developers)") +# define WARN_ON(cond) _BUG_ON (cond, __FILE__, __LINE__, 0, "WARNING: warn ", " (please report to developers)") +# define CHECK_IF(cond) _BUG_ON_ONCE(cond, __FILE__, __LINE__, 0, "WARNING: check ", " (please report to developers)") +# elif !defined(DEBUG_STRICT_ACTION) || (DEBUG_STRICT_ACTION == 1) +/* default level: BUG_ON() crashes, WARN_ON() warns, CHECK_IF() warns */ +# define BUG_ON(cond) _BUG_ON (cond, __FILE__, __LINE__, 3, "FATAL: bug ", "") +# define WARN_ON(cond) _BUG_ON (cond, __FILE__, __LINE__, 0, "WARNING: warn ", " (please report to developers)") +# define CHECK_IF(cond) _BUG_ON_ONCE(cond, __FILE__, __LINE__, 0, "WARNING: check ", " (please report to developers)") +# elif defined(DEBUG_STRICT_ACTION) && (DEBUG_STRICT_ACTION == 2) +/* Stricter level: BUG_ON() crashes, WARN_ON() crashes, CHECK_IF() warns */ +# define BUG_ON(cond) _BUG_ON (cond, __FILE__, __LINE__, 3, "FATAL: bug ", "") +# define WARN_ON(cond) _BUG_ON (cond, __FILE__, __LINE__, 1, "FATAL: warn ", "") +# define CHECK_IF(cond) _BUG_ON_ONCE(cond, __FILE__, __LINE__, 0, "WARNING: check ", " (please report to developers)") +# elif defined(DEBUG_STRICT_ACTION) && (DEBUG_STRICT_ACTION >= 3) +/* Developer/CI level: BUG_ON() crashes, WARN_ON() crashes, CHECK_IF() crashes */ +# define BUG_ON(cond) _BUG_ON (cond, __FILE__, __LINE__, 3, "FATAL: bug ", "") +# define WARN_ON(cond) _BUG_ON (cond, __FILE__, __LINE__, 1, "FATAL: warn ", "") +# define CHECK_IF(cond) _BUG_ON_ONCE(cond, __FILE__, __LINE__, 1, "FATAL: check ", "") +# endif +#else +# define BUG_ON(cond) do { (void)sizeof(cond); } while (0) +# define WARN_ON(cond) do { (void)sizeof(cond); } while (0) +# define CHECK_IF(cond) do { (void)sizeof(cond); } while (0) +#endif + +/* These macros are only for hot paths and remain disabled unless DEBUG_STRICT is 2 or above. + * Only developers/CI should use these levels as they may significantly impact performance by + * enabling checks in sensitive areas. 
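+ *
+ * Hedged build examples (editor's addition; the DEBUG make variable is the
+ * usual way to pass such defines, adjust to your build system):
+ *
+ *	make TARGET=linux-glibc DEBUG="-DDEBUG_STRICT=1"
+ *	make TARGET=linux-glibc DEBUG="-DDEBUG_STRICT=2 -DDEBUG_STRICT_ACTION=3"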
+ */ +#if defined(DEBUG_STRICT) && (DEBUG_STRICT > 1) +# if defined(DEBUG_STRICT_ACTION) && (DEBUG_STRICT_ACTION < 1) +/* Lowest level: BUG_ON() warns, CHECK_IF() warns */ +# define BUG_ON_HOT(cond) _BUG_ON_ONCE(cond, __FILE__, __LINE__, 2, "WARNING: bug ", " (not crashing but process is untrusted now, please report to developers)") +# define CHECK_IF_HOT(cond) _BUG_ON_ONCE(cond, __FILE__, __LINE__, 0, "WARNING: check ", " (please report to developers)") +# elif !defined(DEBUG_STRICT_ACTION) || (DEBUG_STRICT_ACTION < 3) +/* default level: BUG_ON() crashes, CHECK_IF() warns */ +# define BUG_ON_HOT(cond) _BUG_ON (cond, __FILE__, __LINE__, 3, "FATAL: bug ", "") +# define CHECK_IF_HOT(cond) _BUG_ON_ONCE(cond, __FILE__, __LINE__, 0, "WARNING: check ", " (please report to developers)") +# elif defined(DEBUG_STRICT_ACTION) && (DEBUG_STRICT_ACTION >= 3) +/* Developer/CI level: BUG_ON() crashes, CHECK_IF() crashes */ +# define BUG_ON_HOT(cond) _BUG_ON (cond, __FILE__, __LINE__, 3, "FATAL: bug ", "") +# define CHECK_IF_HOT(cond) _BUG_ON_ONCE(cond, __FILE__, __LINE__, 1, "FATAL: check ", "") +# endif +#else +# define BUG_ON_HOT(cond) do { (void)sizeof(cond); } while (0) +# define CHECK_IF_HOT(cond) do { (void)sizeof(cond); } while (0) +#endif + + +/* When not optimizing, clang won't remove that code, so only compile it in when optimizing */ +#if defined(__GNUC__) && defined(__OPTIMIZE__) +#define HA_LINK_ERROR(what) \ + do { \ + /* provoke a build-time error */ \ + extern volatile int what; \ + what = 1; \ + } while (0) +#else +#define HA_LINK_ERROR(what) \ + do { \ + } while (0) +#endif /* __OPTIMIZE__ */ + +/* more reliable free() that clears the pointer */ +#define ha_free(x) do { \ + typeof(x) __x = (x); \ + if (__builtin_constant_p((x)) || __builtin_constant_p(*(x))) { \ + HA_LINK_ERROR(call_to_ha_free_attempts_to_free_a_constant); \ + } \ + free(*__x); \ + *__x = NULL; \ + } while (0) + +/* describes a call place in the code, for example for tracing memory + * allocations or task wakeups. These must be declared static const. 
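+ *
+ * A hedged example (editor's addition) of how a call place is typically
+ * recorded, mirroring the task wakeup wrappers (the exact enum value is an
+ * assumption taken from another header):
+ *
+ *	_task_wakeup(t, f, MK_CALLER(WAKEUP_TYPE_TASK_WAKEUP, 0, 0));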
+ */ +struct ha_caller { + const char *func; // function name + const char *file; // file name + uint16_t line; // line number + uint8_t what; // description of the call, usage specific + uint8_t arg8; // optional argument, usage specific + uint32_t arg32; // optional argument, usage specific +}; + +#define MK_CALLER(_what, _arg8, _arg32) \ + ({ static const struct ha_caller _ = { \ + .func = __func__, .file = __FILE__, .line = __LINE__, \ + .what = _what, .arg8 = _arg8, .arg32 = _arg32 }; \ + &_; }) + +/* handle 'tainted' status */ +enum tainted_flags { + TAINTED_CONFIG_EXP_KW_DECLARED = 0x00000001, + TAINTED_ACTION_EXP_EXECUTED = 0x00000002, + TAINTED_CLI_EXPERT_MODE = 0x00000004, + TAINTED_CLI_EXPERIMENTAL_MODE = 0x00000008, + TAINTED_WARN = 0x00000010, /* a WARN_ON triggered */ + TAINTED_BUG = 0x00000020, /* a BUG_ON triggered */ + TAINTED_SHARED_LIBS = 0x00000040, /* a shared library was loaded */ + TAINTED_REDEFINITION = 0x00000080, /* symbol redefinition detected */ + TAINTED_REPLACED_MEM_ALLOCATOR = 0x00000100, /* memory allocator was replaced using LD_PRELOAD */ + TAINTED_PANIC = 0x00000200, /* a panic dump has started */ + TAINTED_LUA_STUCK = 0x00000400, /* stuck in a Lua context */ + TAINTED_LUA_STUCK_SHARED = 0x00000800, /* stuck in a shared Lua context */ + TAINTED_MEM_TRIMMING_STUCK = 0x00001000, /* stuck while trimming memory */ +}; + +/* this is a bit field made of TAINTED_*, and is declared in haproxy.c */ +extern unsigned int tainted; + +void complain(int *counter, const char *msg, int taint); + +static inline void mark_tainted(const enum tainted_flags flag) +{ + HA_ATOMIC_OR(&tainted, flag); +} + +static inline unsigned int get_tainted() +{ + return HA_ATOMIC_LOAD(&tainted); +} + +#if defined(DEBUG_MEM_STATS) +#include <stdlib.h> +#include <string.h> + +/* Memory allocation statistics are centralized into a global "mem_stats" + * section. This will not work with some linkers. + */ +enum { + MEM_STATS_TYPE_UNSET = 0, + MEM_STATS_TYPE_CALLOC, + MEM_STATS_TYPE_FREE, + MEM_STATS_TYPE_MALLOC, + MEM_STATS_TYPE_REALLOC, + MEM_STATS_TYPE_STRDUP, + MEM_STATS_TYPE_P_ALLOC, + MEM_STATS_TYPE_P_FREE, +}; + +struct mem_stats { + size_t calls; + size_t size; + struct ha_caller caller; + const void *extra; // extra info specific to this call (e.g. pool ptr) +} __attribute__((aligned(sizeof(void*)))); + +#undef calloc +#define calloc(x,y) ({ \ + size_t __x = (x); size_t __y = (y); \ + static struct mem_stats _ __attribute__((used,__section__("mem_stats"),__aligned__(sizeof(void*)))) = { \ + .caller = { \ + .file = __FILE__, .line = __LINE__, \ + .what = MEM_STATS_TYPE_CALLOC, \ + .func = __func__, \ + }, \ + }; \ + HA_WEAK(__start_mem_stats); \ + HA_WEAK(__stop_mem_stats); \ + _HA_ATOMIC_INC(&_.calls); \ + _HA_ATOMIC_ADD(&_.size, __x * __y); \ + calloc(__x,__y); \ +}) + +/* note: we can't redefine free() because we have a few variables and struct + * members called like this. This one may be used before a call to free(), + * and when known, the size should be indicated, otherwise pass zero. The + * pointer is used to know whether the call should be accounted for (null is + * ignored). 
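+ *
+ * Illustrative usage (editor's sketch): account for a release whose size is
+ * known, then for one whose size is not:
+ *
+ *	will_free(buf->area, buf->size);	// known size, accounted exactly
+ *	free(buf->area);
+ *	will_free(ptr, 0);			// unknown size, pass zero
+ *	free(ptr);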
+ */ +#undef will_free +#define will_free(x, y) ({ \ + void *__x = (x); size_t __y = (y); \ + static struct mem_stats _ __attribute__((used,__section__("mem_stats"),__aligned__(sizeof(void*)))) = { \ + .caller = { \ + .file = __FILE__, .line = __LINE__, \ + .what = MEM_STATS_TYPE_FREE, \ + .func = __func__, \ + }, \ + }; \ + HA_WEAK(__start_mem_stats); \ + HA_WEAK(__stop_mem_stats); \ + if (__x) { \ + _HA_ATOMIC_INC(&_.calls); \ + _HA_ATOMIC_ADD(&_.size, __y); \ + } \ +}) + +#undef ha_free +#define ha_free(x) ({ \ + typeof(x) __x = (x); \ + static struct mem_stats _ __attribute__((used,__section__("mem_stats"),__aligned__(sizeof(void*)))) = { \ + .caller = { \ + .file = __FILE__, .line = __LINE__, \ + .what = MEM_STATS_TYPE_FREE, \ + .func = __func__, \ + }, \ + }; \ + HA_WEAK(__start_mem_stats); \ + HA_WEAK(__stop_mem_stats); \ + if (__builtin_constant_p((x)) || __builtin_constant_p(*(x))) { \ + HA_LINK_ERROR(call_to_ha_free_attempts_to_free_a_constant); \ + } \ + if (*__x) \ + _HA_ATOMIC_INC(&_.calls); \ + free(*__x); \ + *__x = NULL; \ +}) + +#undef malloc +#define malloc(x) ({ \ + size_t __x = (x); \ + static struct mem_stats _ __attribute__((used,__section__("mem_stats"),__aligned__(sizeof(void*)))) = { \ + .caller = { \ + .file = __FILE__, .line = __LINE__, \ + .what = MEM_STATS_TYPE_MALLOC, \ + .func = __func__, \ + }, \ + }; \ + HA_WEAK(__start_mem_stats); \ + HA_WEAK(__stop_mem_stats); \ + _HA_ATOMIC_INC(&_.calls); \ + _HA_ATOMIC_ADD(&_.size, __x); \ + malloc(__x); \ +}) + +#undef realloc +#define realloc(x,y) ({ \ + void *__x = (x); size_t __y = (y); \ + static struct mem_stats _ __attribute__((used,__section__("mem_stats"),__aligned__(sizeof(void*)))) = { \ + .caller = { \ + .file = __FILE__, .line = __LINE__, \ + .what = MEM_STATS_TYPE_REALLOC, \ + .func = __func__, \ + }, \ + }; \ + HA_WEAK(__start_mem_stats); \ + HA_WEAK(__stop_mem_stats); \ + _HA_ATOMIC_INC(&_.calls); \ + _HA_ATOMIC_ADD(&_.size, __y); \ + realloc(__x,__y); \ +}) + +#undef strdup +#define strdup(x) ({ \ + const char *__x = (x); size_t __y = strlen(__x); \ + static struct mem_stats _ __attribute__((used,__section__("mem_stats"),__aligned__(sizeof(void*)))) = { \ + .caller = { \ + .file = __FILE__, .line = __LINE__, \ + .what = MEM_STATS_TYPE_STRDUP, \ + .func = __func__, \ + }, \ + }; \ + HA_WEAK(__start_mem_stats); \ + HA_WEAK(__stop_mem_stats); \ + _HA_ATOMIC_INC(&_.calls); \ + _HA_ATOMIC_ADD(&_.size, __y); \ + strdup(__x); \ +}) +#else // DEBUG_MEM_STATS + +#define will_free(x, y) do { } while (0) + +#endif /* DEBUG_MEM_STATS*/ + +/* Add warnings to users of such functions. These will be reported at link time + * indicating what file name and line used them. The goal is to remind their + * users that these are extremely unsafe functions that never have a valid + * reason for being used. + */ +#undef strcat +__attribute__warning("\n" +" * WARNING! strcat() must never be used, because there is no convenient way\n" +" * to use it that is safe. Use memcpy() instead!\n") +extern char *strcat(char *__restrict dest, const char *__restrict src); + +#undef strcpy +__attribute__warning("\n" +" * WARNING! strcpy() must never be used, because there is no convenient way\n" +" * to use it that is safe. Use memcpy() or strlcpy2() instead!\n") +extern char *strcpy(char *__restrict dest, const char *__restrict src); + +#undef strncat +__attribute__warning("\n" +" * WARNING! strncat() must never be used, because there is no convenient way\n" +" * to use it that is safe. 
Use memcpy() instead!\n") +extern char *strncat(char *__restrict dest, const char *__restrict src, size_t n); + +#undef sprintf +__attribute__warning("\n" +" * WARNING! sprintf() must never be used, because there is no convenient way\n" +" * to use it that is safe. Use snprintf() instead!\n") +extern int sprintf(char *__restrict dest, const char *__restrict fmt, ...); + +#if defined(_VA_LIST_DEFINED) || defined(_VA_LIST_DECLARED) || defined(_VA_LIST) +#undef vsprintf +__attribute__warning("\n" +" * WARNING! vsprintf() must never be used, because there is no convenient way\n" +" * to use it that is safe. Use vsnprintf() instead!\n") +extern int vsprintf(char *__restrict dest, const char *__restrict fmt, va_list ap); +#endif + +#endif /* _HAPROXY_BUG_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/capture-t.h b/include/haproxy/capture-t.h new file mode 100644 index 0000000..ebc7fe8 --- /dev/null +++ b/include/haproxy/capture-t.h @@ -0,0 +1,43 @@ +/* + * include/haproxy/capture-t.h + * This file defines types for captures. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_CAPTURE_T_H +#define _HAPROXY_CAPTURE_T_H + +#include <haproxy/pool-t.h> + +struct cap_hdr { + struct cap_hdr *next; + char *name; /* header name, case insensitive, NULL if not header */ + int namelen; /* length of the header name, to speed-up lookups, 0 if !name */ + int len; /* capture length, not including terminal zero */ + int index; /* index in the output array */ + struct pool_head *pool; /* pool of pre-allocated memory area of (len+1) bytes */ +}; + +#endif /* _HAPROXY_CAPTURE_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/capture.h b/include/haproxy/capture.h new file mode 100644 index 0000000..ba0a6c0 --- /dev/null +++ b/include/haproxy/capture.h @@ -0,0 +1,37 @@ +/* + * include/haproxy/capture.h + * This file defines prototypes for captures. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_CAPTURE_H
+#define _HAPROXY_CAPTURE_H
+
+#include <haproxy/capture-t.h>
+#include <haproxy/pool-t.h>
+
+extern struct pool_head *pool_head_capture;
+
+#endif /* _HAPROXY_CAPTURE_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/cbuf-t.h b/include/haproxy/cbuf-t.h
new file mode 100644
index 0000000..27d3bf1
--- /dev/null
+++ b/include/haproxy/cbuf-t.h
@@ -0,0 +1,45 @@
+/*
+ * include/haproxy/cbuf-t.h
+ * This file contains definitions for circular buffers.
+ *
+ * Copyright 2021 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_CBUF_T_H
+#define _HAPROXY_CBUF_T_H
+#ifdef USE_QUIC
+#ifndef USE_OPENSSL
+#error "Must define USE_OPENSSL"
+#endif
+#endif
+
+#include <haproxy/list-t.h>
+
+extern struct pool_head *pool_head_cbuf;
+
+struct cbuf {
+	/* buffer */
+	unsigned char *buf;
+	/* buffer size */
+	size_t sz;
+	/* Writer index */
+	size_t wr;
+	/* Reader index */
+	size_t rd;
+};
+
+#endif /* _HAPROXY_CBUF_T_H */
diff --git a/include/haproxy/cbuf.h b/include/haproxy/cbuf.h
new file mode 100644
index 0000000..b217a5c
--- /dev/null
+++ b/include/haproxy/cbuf.h
@@ -0,0 +1,136 @@
+/*
+ * include/haproxy/cbuf.h
+ * This file contains definitions and prototypes for circular buffers.
+ * Inspired from Linux circular buffers (include/linux/circ_buf.h).
+ *
+ * Copyright 2021 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_CBUF_H
+#define _HAPROXY_CBUF_H
+#ifdef USE_QUIC
+#ifndef USE_OPENSSL
+#error "Must define USE_OPENSSL"
+#endif
+#endif
+
+#include <haproxy/atomic.h>
+#include <haproxy/list.h>
+#include <haproxy/cbuf-t.h>
+
+struct cbuf *cbuf_new(unsigned char *buf, size_t sz);
+void cbuf_free(struct cbuf *cbuf);
+
+/* Amount of data between <rd> and <wr> */
+#define CBUF_DATA(wr, rd, size) (((wr) - (rd)) & ((size) - 1))
+
+/* Return the writer position in <cbuf>.
+ * To be used only by the writer!
+ */
+static inline unsigned char *cb_wr(struct cbuf *cbuf)
+{
+	return cbuf->buf + cbuf->wr;
+}
+
+/* Reset the reader index.
+ * To be used by a reader!
+ */
+static inline void cb_rd_reset(struct cbuf *cbuf)
+{
+	cbuf->rd = 0;
+}
+
+/* Reset the writer index.
+ * To be used by a writer!
+ */
+static inline void cb_wr_reset(struct cbuf *cbuf)
+{
+	cbuf->wr = 0;
+}
+
+/* Increase <cbuf> circular buffer data by <count>.
+ * To be used by a writer!
+ */
+static inline void cb_add(struct cbuf *cbuf, size_t count)
+{
+	cbuf->wr = (cbuf->wr + count) & (cbuf->sz - 1);
+}
+
+/* Return the reader position in <cbuf>.
+ * To be used only by the reader!
+ */
+static inline unsigned char *cb_rd(struct cbuf *cbuf)
+{
+	return cbuf->buf + cbuf->rd;
+}
+
+/* Skip <count> bytes in <cbuf> circular buffer.
+ * To be used by a reader!
+ */
+static inline void cb_del(struct cbuf *cbuf, size_t count)
+{
+	cbuf->rd = (cbuf->rd + count) & (cbuf->sz - 1);
+}
+
+/* Return the amount of data left in <cbuf>.
+ * To be used only by the writer!
+ */
+static inline size_t cb_data(struct cbuf *cbuf)
+{
+	size_t rd;
+
+	rd = HA_ATOMIC_LOAD(&cbuf->rd);
+	return CBUF_DATA(cbuf->wr, rd, cbuf->sz);
+}
+
+/* Return the amount of room left in <cbuf> minus 1 to distinguish
+ * the case where the buffer is full from the case where it is empty.
+ * To be used only by the writer!
+ */
+static inline size_t cb_room(struct cbuf *cbuf)
+{
+	size_t rd;
+
+	rd = HA_ATOMIC_LOAD(&cbuf->rd);
+	return CBUF_DATA(rd, cbuf->wr + 1, cbuf->sz);
+}
+
+/* Return the amount of contiguous data left in <cbuf>.
+ * To be used only by the reader!
+ */
+static inline size_t cb_contig_data(struct cbuf *cbuf)
+{
+	size_t end, n;
+
+	end = cbuf->sz - cbuf->rd;
+	n = (HA_ATOMIC_LOAD(&cbuf->wr) + end) & (cbuf->sz - 1);
+	return n < end ? n : end;
+}
+
+/* Return the amount of contiguous space left in <cbuf>.
+ * To be used only by the writer!
+ */
+static inline size_t cb_contig_space(struct cbuf *cbuf)
+{
+	size_t end, n;
+
+	end = cbuf->sz - 1 - cbuf->wr;
+	n = (HA_ATOMIC_LOAD(&cbuf->rd) + end) & (cbuf->sz - 1);
+	return n <= end ? n : end + 1;
+}
+
+#endif /* _HAPROXY_CBUF_H */
diff --git a/include/haproxy/cfgcond-t.h b/include/haproxy/cfgcond-t.h
new file mode 100644
index 0000000..00fc126
--- /dev/null
+++ b/include/haproxy/cfgcond-t.h
@@ -0,0 +1,105 @@
+/*
+ * include/haproxy/cfgcond-t.h
+ * Types for the configuration condition preprocessor
+ *
+ * Copyright (C) 2000-2021 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_CFGCOND_T_H +#define _HAPROXY_CFGCOND_T_H + +#include <haproxy/api-t.h> + +/* nested if/elif/else/endif block states */ +enum nested_cond_state { + NESTED_COND_IF_TAKE, // "if" with a true condition + NESTED_COND_IF_DROP, // "if" with a false condition + NESTED_COND_IF_SKIP, // "if" masked by an outer false condition + + NESTED_COND_ELIF_TAKE, // "elif" with a true condition from a false one + NESTED_COND_ELIF_DROP, // "elif" with a false condition from a false one + NESTED_COND_ELIF_SKIP, // "elif" masked by an outer false condition or a previously taken if + + NESTED_COND_ELSE_TAKE, // taken "else" after an if false condition + NESTED_COND_ELSE_DROP, // "else" masked by outer false condition or an if true condition +}; + +/* 100 levels of nested conditions should already be sufficient */ +#define MAXNESTEDCONDS 100 + +/* supported conditional predicates for .if/.elif */ +enum cond_predicate { + CFG_PRED_NONE, // none + CFG_PRED_DEFINED, // "defined" + CFG_PRED_FEATURE, // "feature" + CFG_PRED_STREQ, // "streq" + CFG_PRED_STRNEQ, // "strneq" + CFG_PRED_STRSTR, // "strstr" + CFG_PRED_VERSION_ATLEAST, // "version_atleast" + CFG_PRED_VERSION_BEFORE, // "version_before" + CFG_PRED_OSSL_VERSION_ATLEAST, // "openssl_version_atleast" + CFG_PRED_OSSL_VERSION_BEFORE, // "openssl_version_before" + CFG_PRED_SSLLIB_NAME_STARTSWITH, // "ssllib_name_startswith" + CFG_PRED_ENABLED, // "enabled" +}; + +/* types for condition terms */ +enum cfg_cond_term_type { + CCTT_NONE = 0, + CCTT_FALSE, + CCTT_TRUE, + CCTT_PRED, + CCTT_PAREN, // '(' EXPR ')' +}; + +/* keyword for a condition predicate */ +struct cond_pred_kw { + const char *word; // NULL marks the end of the list + enum cond_predicate prd; // one of the CFG_PRED_* above + uint64_t arg_mask; // mask of supported arguments (strings only) +}; + +/* condition term */ +struct cfg_cond_term { + enum cfg_cond_term_type type; // CCTT_* + struct arg *args; // arguments for predicates + int neg; // 0: direct result; 1: negate + union { + const struct cond_pred_kw *pred; // predicate (function) + struct cfg_cond_expr *expr; // expression for CCTT_PAREN + }; +}; + +/* condition sub-expression for an AND: + * expr_and = <term> '&&' <expr_and> + * | <term> + */ +struct cfg_cond_and { + struct cfg_cond_term *left; + struct cfg_cond_and *right; // may be NULL +}; + +/* condition expression: + * expr = <expr_and> '||' <expr> + * | <expr_and> + */ +struct cfg_cond_expr { + struct cfg_cond_and *left; + struct cfg_cond_expr *right; // may be NULL +}; + +#endif /* _HAPROXY_CFGCOND_T_H */ diff --git a/include/haproxy/cfgcond.h b/include/haproxy/cfgcond.h new file mode 100644 index 0000000..3171f81 --- /dev/null +++ b/include/haproxy/cfgcond.h @@ -0,0 +1,43 @@ +/* + * include/haproxy/cfgcond.h + * Configuration condition preprocessor + * + * Copyright (C) 2000-2021 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software 
Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_CFGCOND_H
+#define _HAPROXY_CFGCOND_H
+
+#include <haproxy/api.h>
+#include <haproxy/cfgcond-t.h>
+
+const struct cond_pred_kw *cfg_lookup_cond_pred(const char *str);
+int cfg_parse_cond_term(const char **text, struct cfg_cond_term **term, char **err, const char **errptr, int maxdepth);
+int cfg_eval_cond_term(const struct cfg_cond_term *term, char **err);
+void cfg_free_cond_term(struct cfg_cond_term *term);
+
+int cfg_parse_cond_and(const char **text, struct cfg_cond_and **expr, char **err, const char **errptr, int maxdepth);
+int cfg_eval_cond_and(struct cfg_cond_and *expr, char **err);
+void cfg_free_cond_and(struct cfg_cond_and *expr);
+
+int cfg_parse_cond_expr(const char **text, struct cfg_cond_expr **expr, char **err, const char **errptr, int maxdepth);
+int cfg_eval_cond_expr(struct cfg_cond_expr *expr, char **err);
+void cfg_free_cond_expr(struct cfg_cond_expr *expr);
+
+int cfg_eval_condition(char **args, char **err, const char **errptr);
+
+#endif
diff --git a/include/haproxy/cfgdiag.h b/include/haproxy/cfgdiag.h
new file mode 100644
index 0000000..6989109
--- /dev/null
+++ b/include/haproxy/cfgdiag.h
@@ -0,0 +1,11 @@
+#ifndef _HAPROXY_CFGDIAG_H
+#define _HAPROXY_CFGDIAG_H
+
+/* Placeholder to execute various diagnostic checks after the configuration file
+ * has been fully parsed. It will output a warning for each diagnostic found.
+ *
+ * Returns 0 if no diagnostic message has been found, otherwise 1.
+ */
+int cfg_run_diagnostics(void);
+
+#endif /* _HAPROXY_CFGDIAG_H */
diff --git a/include/haproxy/cfgparse.h b/include/haproxy/cfgparse.h
new file mode 100644
index 0000000..adcabb3
--- /dev/null
+++ b/include/haproxy/cfgparse.h
@@ -0,0 +1,149 @@
+/*
+ * include/haproxy/cfgparse.h
+ * Configuration parsing functions.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_CFGPARSE_H
+#define _HAPROXY_CFGPARSE_H
+
+#include <haproxy/api.h>
+
+struct hap_cpuset;
+struct proxy;
+struct bind_conf;
+struct acl_cond;
+
+/* configuration sections */
+#define CFG_NONE	0
+#define CFG_GLOBAL	1
+#define CFG_LISTEN	2
+#define CFG_USERLIST	3
+#define CFG_PEERS	4
+#define CFG_CRTLIST	5
+
+/* various keyword modifiers */
+enum kw_mod {
+	KWM_STD = 0,	/* normal */
+	KWM_NO,		/* "no" prefixed before the keyword */
+	KWM_DEF,	/* "default" prefixed before the keyword */
+};
+
+enum cfg_keyword_flags {
+	KWF_EXPERIMENTAL = 0x1,
+	KWF_MATCH_PREFIX = 0x2,
+};
+
+struct cfg_keyword {
+	int section;                            /* section type for this keyword */
+	const char *kw;                         /* the keyword itself */
+	int (*parse)(                           /* 0=OK, <0=Alert, >0=Warning */
+		     char **args,               /* command line and arguments */
+		     int section_type,          /* current section CFG_{GLOBAL|LISTEN} */
+		     struct proxy *curpx,       /* current proxy (NULL in GLOBAL) */
+		     const struct proxy *defpx, /* default proxy (NULL in GLOBAL) */
+		     const char *file,          /* config file name */
+		     int line,                  /* config file line number */
+		     char **err);               /* error or warning message output pointer */
+	int flags;
+};
+
+/* A keyword list. It is a NULL-terminated array of keywords. It embeds a
+ * struct list in order to be linked to other lists, allowing it to easily
+ * be declared where it is needed, and linked without duplicating data nor
+ * allocating memory.
+ */
+struct cfg_kw_list {
+	struct list list;
+	struct cfg_keyword kw[VAR_ARRAY];
+};
+
+/* permits storing a configuration section */
+struct cfg_section {
+	struct list list;
+	char *section_name;
+	int (*section_parser)(const char *, int, char **, int);
+	int (*post_section_parser)();
+};
+
+/* stores a post-configuration-parsing callback */
+struct cfg_postparser {
+	struct list list;
+	char *name;
+	int (*func)();
+};
+
+extern struct list postparsers;
+extern int cfg_maxpconn;
+extern int cfg_maxconn;
+extern char *cfg_scope;
+extern struct cfg_kw_list cfg_keywords;
+extern char *cursection;
+extern int non_global_section_parsed;
+
+extern struct proxy *curproxy;
+
+int cfg_parse_global(const char *file, int linenum, char **args, int inv);
+int cfg_parse_listen(const char *file, int linenum, char **args, int inv);
+int cfg_parse_track_sc_num(unsigned int *track_sc_num,
+                           const char *arg, const char *end, char **err);
+int readcfgfile(const char *file);
+void cfg_register_keywords(struct cfg_kw_list *kwl);
+void cfg_unregister_keywords(struct cfg_kw_list *kwl);
+int check_config_validity(void);
+int str2listener(char *str, struct proxy *curproxy, struct bind_conf *bind_conf, const char *file, int line, char **err);
+int str2receiver(char *str, struct proxy *curproxy, struct bind_conf *bind_conf, const char *file, int line, char **err);
+int cfg_register_section(char *section_name,
+                         int (*section_parser)(const char *, int, char **, int),
+                         int (*post_section_parser)());
+int cfg_register_postparser(char *name, int (*func)());
+void cfg_unregister_sections(void);
+void cfg_backup_sections(struct list *backup_sections);
+void cfg_restore_sections(struct list *backup_sections);
+int warnif_misplaced_tcp_conn(struct proxy *proxy, const char *file, int line, const char *arg);
+int warnif_misplaced_tcp_sess(struct proxy *proxy, const char *file, int line, const char *arg);
+int 
warnif_misplaced_tcp_cont(struct proxy *proxy, const char *file, int line, const char *arg); +int warnif_cond_conflicts(const struct acl_cond *cond, unsigned int where, const char *file, int line); +int warnif_tcp_http_cond(const struct proxy *px, const struct acl_cond *cond); +int too_many_args_idx(int maxarg, int index, char **args, char **msg, int *err_code); +int too_many_args(int maxarg, char **args, char **msg, int *err_code); +int alertif_too_many_args_idx(int maxarg, int index, const char *file, int linenum, char **args, int *err_code); +int alertif_too_many_args(int maxarg, const char *file, int linenum, char **args, int *err_code); +int parse_process_number(const char *arg, unsigned long *proc, int max, int *autoinc, char **err); +void free_email_alert(struct proxy *p); +const char *cfg_find_best_match(const char *word, const struct list *list, int section, const char **extra); +int warnifnotcap(struct proxy *proxy, int cap, const char *file, int line, const char *arg, const char *hint); +int failifnotcap(struct proxy *proxy, int cap, const char *file, int line, const char *arg, const char *hint); +void cfg_dump_registered_keywords(); + +/* simplified way to define a section parser */ +#define REGISTER_CONFIG_SECTION(name, parse, post) \ + INITCALL3(STG_REGISTER, cfg_register_section, (name), (parse), (post)) + +#define REGISTER_CONFIG_POSTPARSER(name, parser) \ + INITCALL2(STG_REGISTER, cfg_register_postparser, (name), (parser)) + +#endif /* _HAPROXY_CFGPARSE_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/channel-t.h b/include/haproxy/channel-t.h new file mode 100644 index 0000000..6972edb --- /dev/null +++ b/include/haproxy/channel-t.h @@ -0,0 +1,314 @@ +/* + * include/haproxy/channel-t.h + * Channel management definitions, macros and inline functions. + * + * Copyright (C) 2000-2014 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_CHANNEL_T_H +#define _HAPROXY_CHANNEL_T_H + +#include <haproxy/api-t.h> +#include <haproxy/buf-t.h> +#include <haproxy/show_flags-t.h> + +/* The CF_* macros designate Channel Flags, which may be ORed in the bit field + * member 'flags' in struct channel. Here we have several types of flags : + * + * - pure status flags, reported by the data layer, which must be cleared + * before doing further I/O : + * CF_*_EVENT, CF_*_PARTIAL + * + * - pure status flags, reported by stream connector layer, which must also + * be cleared before doing further I/O : + * CF_*_TIMEOUT + * + * - read-only indicators reported by lower data levels : + * CF_STREAMER, CF_STREAMER_FAST + * + * The flags have been arranged for readability, so that the read and write + * bits have the same position in a byte (read being the lower byte and write + * the second one). All flag names are relative to the channel. 
For instance,
+ * 'write' indicates the direction from the channel to the stream connector.
+ * Please also update the chn_show_flags() function below in case of changes.
+ */
+
+#define CF_READ_EVENT 0x00000001 /* a read event detected on producer side */
+/* unused: 0x00000002 */
+#define CF_READ_TIMEOUT 0x00000004 /* timeout while waiting for producer */
+/* unused: 0x00000008 */
+
+/* unused: 0x00000010 - 0x00000080 */
+
+#define CF_WRITE_EVENT 0x00000100 /* a write event detected on consumer side */
+/* unused: 0x00000200 */
+#define CF_WRITE_TIMEOUT 0x00000400 /* timeout while waiting for consumer */
+/* unused: 0x00000800 */
+
+#define CF_WAKE_WRITE 0x00001000 /* wake the task up when there's write activity */
+/* unused: 0x00002000 - 0x00004000 */
+#define CF_AUTO_CLOSE 0x00008000 /* producer can forward shutdown to other side */
+
+#define CF_STREAMER 0x00010000 /* the producer is identified as streaming data */
+#define CF_STREAMER_FAST 0x00020000 /* the consumer seems to eat the stream very fast */
+
+#define CF_WROTE_DATA 0x00040000 /* some data were sent from this buffer */
+/* unused: 0x00080000 - 0x00400000 */
+#define CF_AUTO_CONNECT 0x00800000 /* consumer may attempt to establish a new connection */
+
+#define CF_DONT_READ 0x01000000 /* disable reading for now */
+/* unused: 0x02000000 - 0x08000000 */
+
+#define CF_WAKE_ONCE 0x10000000 /* pretend there is activity on this channel (one-shot) */
+#define CF_FLT_ANALYZE 0x20000000 /* at least one filter is still analyzing this channel */
+/* unused: 0x40000000 */
+#define CF_ISRESP 0x80000000 /* 0 = request channel, 1 = response channel */
+
+/* Masks which define input events for stream analysers */
+#define CF_MASK_ANALYSER (CF_READ_EVENT|CF_READ_TIMEOUT|CF_WRITE_EVENT|CF_WAKE_ONCE)
+
+/* This function is used to report flags in debugging tools. Please reflect
+ * below any single-bit flag addition above in the same order via the
+ * __APPEND_FLAG macro. The new end of the buffer is returned.
+ */
+static forceinline char *chn_show_flags(char *buf, size_t len, const char *delim, uint flg)
+{
+#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__)
+	/* prologue */
+	_(0);
+	/* flags */
+	_(CF_READ_EVENT, _(CF_READ_TIMEOUT,
+	_(CF_WRITE_EVENT,
+	_(CF_WRITE_TIMEOUT,
+	_(CF_WAKE_WRITE, _(CF_AUTO_CLOSE,
+	_(CF_STREAMER, _(CF_STREAMER_FAST, _(CF_WROTE_DATA,
+	_(CF_AUTO_CONNECT, _(CF_DONT_READ,
+	_(CF_WAKE_ONCE, _(CF_FLT_ANALYZE,
+	_(CF_ISRESP))))))))))))));
+	/* epilogue */
+	_(~0U);
+	return buf;
+#undef _
+}
+
+/* Analysers (channel->analysers).
+ * Those bits indicate that there is some processing to do on the buffer
+ * contents. It will probably evolve into a linked list later. Those
+ * analysers could be compared to higher level processors.
+ * The field is blanked by channel_init() and only by analysers themselves
+ * afterwards.
+ * Please also update the chn_show_analysers() function below in case of changes.
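+ *
+ * Editor's note: a hedged illustration (not part of the original header) of
+ * how an analyser bit is typically scheduled and then released; the bit name
+ * is taken from the list below:
+ *
+ *	chn->analysers |= AN_REQ_INSPECT_FE;		// subscribe the analyser
+ *	if (chn->analysers & AN_REQ_INSPECT_FE) {
+ *		// ... run the analyser, then unsubscribe it once done:
+ *		chn->analysers &= ~AN_REQ_INSPECT_FE;
+ *	}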
+ */ +/* AN_REQ_FLT_START_FE: 0x00000001 */ +#define AN_REQ_INSPECT_FE 0x00000002 /* inspect request contents in the frontend */ +#define AN_REQ_WAIT_HTTP 0x00000004 /* wait for an HTTP request */ +#define AN_REQ_HTTP_BODY 0x00000008 /* wait for HTTP request body */ +#define AN_REQ_HTTP_PROCESS_FE 0x00000010 /* process the frontend's HTTP part */ +#define AN_REQ_SWITCHING_RULES 0x00000020 /* apply the switching rules */ +/* AN_REQ_FLT_START_BE: 0x00000040 */ +#define AN_REQ_INSPECT_BE 0x00000080 /* inspect request contents in the backend */ +#define AN_REQ_HTTP_PROCESS_BE 0x00000100 /* process the backend's HTTP part */ +#define AN_REQ_HTTP_TARPIT 0x00000200 /* wait for end of HTTP tarpit */ +#define AN_REQ_SRV_RULES 0x00000400 /* use-server rules */ +#define AN_REQ_HTTP_INNER 0x00000800 /* inner processing of HTTP request */ +#define AN_REQ_PRST_RDP_COOKIE 0x00001000 /* persistence on rdp cookie */ +#define AN_REQ_STICKING_RULES 0x00002000 /* table persistence matching */ +/* AN_REQ_FLT_HTTP_HDRS: 0x00004000 */ +#define AN_REQ_HTTP_XFER_BODY 0x00008000 /* forward request body */ +#define AN_REQ_WAIT_CLI 0x00010000 +/* AN_REQ_FLT_XFER_DATA: 0x00020000 */ +/* AN_REQ_FLT_END: 0x00040000 */ +#define AN_REQ_ALL 0x0001bfbe /* all of the request analysers */ + +/* response analysers */ +/* AN_RES_FLT_START_FE: 0x00080000 */ +/* AN_RES_FLT_START_BE: 0x00100000 */ +#define AN_RES_INSPECT 0x00200000 /* content inspection */ +#define AN_RES_WAIT_HTTP 0x00400000 /* wait for HTTP response */ +#define AN_RES_STORE_RULES 0x00800000 /* table persistence matching */ +#define AN_RES_HTTP_PROCESS_BE 0x01000000 /* process backend's HTTP part */ +#define AN_RES_HTTP_PROCESS_FE 0x01000000 /* process frontend's HTTP part (same for now) */ +/* AN_RES_FLT_HTTP_HDRS: 0x02000000 */ +#define AN_RES_HTTP_XFER_BODY 0x04000000 /* forward response body */ +#define AN_RES_WAIT_CLI 0x08000000 +/* AN_RES_FLT_XFER_DATA: 0x10000000 */ +/* AN_RES_FLT_END: 0x20000000 */ +#define AN_RES_ALL 0x0de00000 /* all of the response analysers */ + +/* filters interleaved with analysers, see above */ +#define AN_REQ_FLT_START_FE 0x00000001 +#define AN_REQ_FLT_START_BE 0x00000040 +#define AN_REQ_FLT_HTTP_HDRS 0x00004000 +#define AN_REQ_FLT_XFER_DATA 0x00020000 +#define AN_REQ_FLT_END 0x00040000 + +#define AN_RES_FLT_START_FE 0x00080000 +#define AN_RES_FLT_START_BE 0x00100000 +#define AN_RES_FLT_HTTP_HDRS 0x02000000 +#define AN_RES_FLT_XFER_DATA 0x10000000 +#define AN_RES_FLT_END 0x20000000 + +/* This function is used to report flags in debugging tools. Please reflect + * below any single-bit flag addition above in the same order via the + * __APPEND_FLAG macro. The new end of the buffer is returned. + */ +static forceinline char *chn_show_analysers(char *buf, size_t len, const char *delim, uint flg) +{ +#define _(f, ...) 
__APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__) + /* prologue */ + _(0); + /* request flags */ + _(AN_REQ_FLT_START_FE, _(AN_REQ_INSPECT_FE, _(AN_REQ_WAIT_HTTP, + _(AN_REQ_HTTP_BODY, _(AN_REQ_HTTP_PROCESS_FE, _(AN_REQ_SWITCHING_RULES, + _(AN_REQ_FLT_START_BE, _(AN_REQ_INSPECT_BE, _(AN_REQ_HTTP_PROCESS_BE, + _(AN_REQ_HTTP_TARPIT, _(AN_REQ_SRV_RULES, _(AN_REQ_HTTP_INNER, + _(AN_REQ_PRST_RDP_COOKIE, _(AN_REQ_STICKING_RULES, + _(AN_REQ_FLT_HTTP_HDRS, _(AN_REQ_HTTP_XFER_BODY, _(AN_REQ_WAIT_CLI, + _(AN_REQ_FLT_XFER_DATA, _(AN_REQ_FLT_END, + /* response flags */ + _(AN_RES_FLT_START_FE, _(AN_RES_FLT_START_BE, _(AN_RES_INSPECT, + _(AN_RES_WAIT_HTTP, _(AN_RES_STORE_RULES, _(AN_RES_HTTP_PROCESS_FE, + _(AN_RES_HTTP_PROCESS_BE, _(AN_RES_FLT_HTTP_HDRS, + _(AN_RES_HTTP_XFER_BODY, _(AN_RES_WAIT_CLI, _(AN_RES_FLT_XFER_DATA, + _(AN_RES_FLT_END))))))))))))))))))))))))))))))); + /* epilogue */ + _(~0U); + return buf; +#undef _ +} + +/* Magic value to forward infinite size (TCP, ...), used with ->to_forward */ +#define CHN_INFINITE_FORWARD MAX_RANGE(unsigned int) + + +struct channel { + unsigned int flags; /* CF_* */ + unsigned int analysers; /* bit field indicating what to do on the channel */ + struct buffer buf; /* buffer attached to the channel, always present but may move */ + size_t output; /* part of buffer which is to be forwarded */ + unsigned int to_forward; /* number of bytes to forward after out without a wake-up */ + unsigned short last_read; /* 16 lower bits of last read date (max pause=65s) */ + unsigned char xfer_large; /* number of consecutive large xfers */ + unsigned char xfer_small; /* number of consecutive small xfers */ + unsigned long long total; /* total data read */ + int analyse_exp; /* expiration date for current analysers (if set) */ +}; + + +/* Note about the channel structure + * + * A channel stores information needed to reliably transport data in a single + * direction. It stores status flags, timeouts, counters, subscribed analysers, + * pointers to a data producer and to a data consumer, and information about + * the amount of data which is allowed to flow directly from the producer to + * the consumer without waking up the analysers. + * + * A channel may buffer data into two locations : + * - a visible buffer (->buf) + * - an invisible buffer which right now consists in a pipe making use of + * kernel buffers that cannot be tampered with. + * + * Data stored into the first location may be analysed and altered by analysers + * while data stored in pipes is only aimed at being transported from one + * network socket to another one without being subject to memory copies. This + * buffer may only be used when both the socket layer and the data layer of the + * producer and the consumer support it, which typically is the case with Linux + * splicing over sockets, and when there are enough data to be transported + * without being analyzed (transport of TCP/HTTP payload or tunnelled data, + * which is indicated by ->to_forward). + * + * In order not to mix data streams, the producer may only feed the invisible + * data with data to forward, and only when the visible buffer is empty. The + * producer may not always be able to feed the invisible buffer due to platform + * limitations (lack of kernel support). + * + * Conversely, the consumer must always take data from the invisible data first + * before ever considering visible data. 
There is no limit to the size of data + * to consume from the invisible buffer, as platform-specific implementations + * will rarely leave enough control on this. So any byte fed into the invisible + * buffer is expected to reach the destination file descriptor, by any means. + * However, it's the consumer's responsibility to ensure that the invisible + * data has been entirely consumed before consuming visible data. This must be + * reflected by ->pipe->data. This is very important as this and only this can + * ensure strict ordering of data between buffers. + * + * The producer is responsible for decreasing ->to_forward. The ->to_forward + * parameter indicates how many bytes may be fed into either data buffer + * without waking the parent up. The special value CHN_INFINITE_FORWARD is + * never decreased nor increased. + * + * The buf->o parameter says how many bytes may be consumed from the visible + * buffer. This parameter is updated by any buffer_write() as well as any data + * forwarded through the visible buffer. Since the ->to_forward attribute + * applies to data after buf->p, an analyser will not see a buffer which has a + * non-null ->to_forward with buf->i > 0. A producer is responsible for raising + * buf->o by min(to_forward, buf->i) when it injects data into the buffer. + * + * The consumer is responsible for decreasing ->buf->o when it sends data + * from the visible buffer, and ->pipe->data when it sends data from the + * invisible buffer. + * + * A real-world example consists in part in an HTTP response waiting in a + * buffer to be forwarded. We know the header length (300) and the amount of + * data to forward (content-length=9000). The buffer already contains 1000 + * bytes of data after the 300 bytes of headers. Thus the caller will set + * buf->o to 300 indicating that it explicitly wants to send those data, and + * set ->to_forward to 9000 (content-length). This value must be normalised + * immediately after updating ->to_forward : since there are already 1300 bytes + * in the buffer, 300 of which are already counted in buf->o, and that size + * is smaller than ->to_forward, we must update buf->o to 1300 to flush the + * whole buffer, and reduce ->to_forward to 8000. After that, the producer may + * try to feed the additional data through the invisible buffer using a + * platform-specific method such as splice(). + * + * The ->to_forward entry is also used to detect whether we can fill the buffer + * or not. The idea is that we need to save some space for data manipulation + * (mainly header rewriting in HTTP) so we don't want to have a full buffer on + * input before processing a request or response. Thus, we ensure that there is + * always global.maxrewrite bytes of free space. Since we don't want to forward + * chunks without filling the buffer, we rely on ->to_forward. When ->to_forward + * is null, we may have some processing to do so we don't want to fill the + * buffer. When ->to_forward is non-null, we know we don't care for at least as + * many bytes. In the end, we know that each of the ->to_forward bytes will + * eventually leave the buffer. So as long as ->to_forward is larger than + * global.maxrewrite, we can fill the buffer. If ->to_forward is smaller than + * global.maxrewrite, then we don't want to fill the buffer with more than + * buf->size - global.maxrewrite + ->to_forward. + * + * A buffer may contain up to 5 areas : + * - the data waiting to be sent. 
These data are located between buf->p-o and + * buf->p ; + * - the data to process and possibly transform. These data start at + * buf->p and may be up to ->i bytes long. + * - the data to preserve. They start at ->p and stop at ->p+i. The limit + * between the two solely depends on the protocol being analysed. + * - the spare area : it is the remainder of the buffer, which can be used to + * store new incoming data. It starts at ->p+i and is up to ->size-i-o long. + * It may be limited by global.maxrewrite. + * - the reserved area : this is the area which must not be filled and is + * reserved for possible rewrites ; it is up to global.maxrewrite bytes + * long. + */ + +#endif /* _HAPROXY_CHANNEL_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/channel.h b/include/haproxy/channel.h new file mode 100644 index 0000000..17dd75f --- /dev/null +++ b/include/haproxy/channel.h @@ -0,0 +1,1021 @@ +/* + * include/haproxy/channel.h + * Channel management definitions, macros and inline functions. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_CHANNEL_H +#define _HAPROXY_CHANNEL_H + +#include <haproxy/api.h> +#include <haproxy/channel-t.h> +#include <haproxy/dynbuf.h> +#include <haproxy/global.h> +#include <haproxy/htx.h> +#include <haproxy/stream.h> +#include <haproxy/task.h> +#include <haproxy/ticks.h> +#include <haproxy/tools-t.h> + +struct stconn; + +/* perform minimal initializations, report 0 in case of error, 1 if OK. 
*/
+int init_channel();
+
+unsigned long long __channel_forward(struct channel *chn, unsigned long long bytes);
+
+/* SI-to-channel functions working with buffers */
+int ci_putblk(struct channel *chn, const char *str, int len);
+int ci_putchr(struct channel *chn, char c);
+int ci_getline_nc(const struct channel *chn, char **blk1, size_t *len1, char **blk2, size_t *len2);
+int ci_getblk_nc(const struct channel *chn, char **blk1, size_t *len1, char **blk2, size_t *len2);
+int ci_insert_line2(struct channel *c, int pos, const char *str, int len);
+int co_inject(struct channel *chn, const char *msg, int len);
+int co_getchar(const struct channel *chn, char *c);
+int co_getline(const struct channel *chn, char *str, int len);
+int co_getdelim(const struct channel *chn, char *str, int len, const char *delim, char escape);
+int co_getword(const struct channel *chn, char *str, int len, char sep);
+int co_getblk(const struct channel *chn, char *blk, int len, int offset);
+int co_getline_nc(const struct channel *chn, const char **blk1, size_t *len1, const char **blk2, size_t *len2);
+int co_getblk_nc(const struct channel *chn, const char **blk1, size_t *len1, const char **blk2, size_t *len2);
+
+
+/* returns a pointer to the stream the channel belongs to */
+static inline struct stream *chn_strm(const struct channel *chn)
+{
+	if (chn->flags & CF_ISRESP)
+		return LIST_ELEM(chn, struct stream *, res);
+	else
+		return LIST_ELEM(chn, struct stream *, req);
+}
+
+/* returns a pointer to the stream connector feeding the channel (producer) */
+static inline struct stconn *chn_prod(const struct channel *chn)
+{
+	if (chn->flags & CF_ISRESP)
+		return LIST_ELEM(chn, struct stream *, res)->scb;
+	else
+		return LIST_ELEM(chn, struct stream *, req)->scf;
+}
+
+/* returns a pointer to the stream connector consuming the channel (consumer) */
+static inline struct stconn *chn_cons(const struct channel *chn)
+{
+	if (chn->flags & CF_ISRESP)
+		return LIST_ELEM(chn, struct stream *, res)->scf;
+	else
+		return LIST_ELEM(chn, struct stream *, req)->scb;
+}
+
+/* c_orig() : returns the pointer to the channel buffer's origin */
+static inline char *c_orig(const struct channel *c)
+{
+	return b_orig(&c->buf);
+}
+
+/* c_size() : returns the size of the channel's buffer */
+static inline size_t c_size(const struct channel *c)
+{
+	return b_size(&c->buf);
+}
+
+/* c_wrap() : returns the pointer to the channel buffer's wrapping point */
+static inline char *c_wrap(const struct channel *c)
+{
+	return b_wrap(&c->buf);
+}
+
+/* c_data() : returns the amount of data in the channel's buffer */
+static inline size_t c_data(const struct channel *c)
+{
+	return b_data(&c->buf);
+}
+
+/* c_room() : returns the room left in the channel's buffer */
+static inline size_t c_room(const struct channel *c)
+{
+	return b_size(&c->buf) - b_data(&c->buf);
+}
+
+/* c_empty() : returns a boolean indicating if the channel's buffer is empty */
+static inline size_t c_empty(const struct channel *c)
+{
+	return !c_data(c);
+}
+
+/* c_full() : returns a boolean indicating if the channel's buffer is full */
+static inline size_t c_full(const struct channel *c)
+{
+	return !c_room(c);
+}
+
+/* co_data() : returns the amount of output data in the channel's buffer */
+static inline size_t co_data(const struct channel *c)
+{
+	CHECK_IF_HOT(c->output > c_data(c));
+	return c->output;
+}
+
+/* ci_data() : returns the amount of input data in the channel's buffer */
+static inline size_t ci_data(const struct channel *c)
+{
+	return c_data(c) - co_data(c);
+}
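+
+/* Editor's note: a hedged sketch (not in the original header) of how the
+ * size accessors above relate to each other for any channel <chn>:
+ *
+ *	size_t total = c_data(chn);	// all bytes present in the buffer
+ *	size_t out   = co_data(chn);	// bytes already scheduled for sending
+ *	size_t in    = ci_data(chn);	// bytes still awaiting analysis
+ *
+ * The invariant total == out + in always holds, and the remaining room is
+ * c_room(chn) == c_size(chn) - total.
+ */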
+ +/* ci_next() : for an absolute pointer <p> or a relative offset <o> pointing to + * a valid location within channel <c>'s buffer, returns either the absolute + * pointer or the relative offset pointing to the next byte, which usually is + * at (p + 1) unless p reaches the wrapping point and wrapping is needed. + */ +static inline size_t ci_next_ofs(const struct channel *c, size_t o) +{ + return b_next_ofs(&c->buf, o); +} +static inline char *ci_next(const struct channel *c, const char *p) +{ + return b_next(&c->buf, p); +} + + +/* c_ptr() : returns a pointer to an offset relative to the beginning of the + * input data in the buffer. If instead the offset is negative, a pointer to + * existing output data is returned. The function only takes care of wrapping, + * it's up to the caller to ensure the offset is always within byte count + * bounds. + */ +static inline char *c_ptr(const struct channel *c, ssize_t ofs) +{ + return b_peek(&c->buf, co_data(c) + ofs); +} + +/* c_adv() : advances the channel's buffer by <adv> bytes, which means that the + * buffer's pointer advances, and that as many bytes from in are transferred + * from in to out. The caller is responsible for ensuring that adv is always + * smaller than or equal to b->i. + */ +static inline void c_adv(struct channel *c, size_t adv) +{ + c->output += adv; + BUG_ON_HOT(c->output > c_data(c)); +} + +/* c_rew() : rewinds the channel's buffer by <adv> bytes, which means that the + * buffer's pointer goes backwards, and that as many bytes from out are moved + * to in. The caller is responsible for ensuring that adv is always smaller + * than or equal to b->o. + */ +static inline void c_rew(struct channel *c, size_t adv) +{ + BUG_ON_HOT(c->output < adv); + c->output -= adv; +} + +/* c_realign_if_empty() : realign the channel's buffer if it's empty */ +static inline void c_realign_if_empty(struct channel *chn) +{ + b_realign_if_empty(&chn->buf); +} + +/* Sets the amount of output for the channel */ +static inline void co_set_data(struct channel *c, size_t output) +{ + BUG_ON_HOT(output > c_data(c)); + c->output = output; +} + + +/* co_head() : returns a pointer to the beginning of output data in the buffer. + * The "__" variants don't support wrapping, "ofs" are relative to + * the buffer's origin. + */ +static inline size_t __co_head_ofs(const struct channel *c) +{ + return __b_peek_ofs(&c->buf, 0); +} +static inline char *__co_head(const struct channel *c) +{ + return __b_peek(&c->buf, 0); +} +static inline size_t co_head_ofs(const struct channel *c) +{ + return b_peek_ofs(&c->buf, 0); +} +static inline char *co_head(const struct channel *c) +{ + return b_peek(&c->buf, 0); +} + + +/* co_tail() : returns a pointer to the end of output data in the buffer. + * The "__" variants don't support wrapping, "ofs" are relative to + * the buffer's origin. + */ +static inline size_t __co_tail_ofs(const struct channel *c) +{ + return __b_peek_ofs(&c->buf, co_data(c)); +} +static inline char *__co_tail(const struct channel *c) +{ + return __b_peek(&c->buf, co_data(c)); +} +static inline size_t co_tail_ofs(const struct channel *c) +{ + return b_peek_ofs(&c->buf, co_data(c)); +} +static inline char *co_tail(const struct channel *c) +{ + return b_peek(&c->buf, co_data(c)); +} + + +/* ci_head() : returns a pointer to the beginning of input data in the buffer. + * The "__" variants don't support wrapping, "ofs" are relative to + * the buffer's origin. 
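+ *
+ * Editor's illustration (not part of the original header): the first byte
+ * still awaiting analysis can be read as follows, with wrapping handled by
+ * the accessor itself:
+ *
+ *	char c;
+ *	if (ci_data(chn) > 0)
+ *		c = *ci_head(chn);	// first input byte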
+ */ +static inline size_t __ci_head_ofs(const struct channel *c) +{ + return __b_peek_ofs(&c->buf, co_data(c)); +} +static inline char *__ci_head(const struct channel *c) +{ + return __b_peek(&c->buf, co_data(c)); +} +static inline size_t ci_head_ofs(const struct channel *c) +{ + return b_peek_ofs(&c->buf, co_data(c)); +} +static inline char *ci_head(const struct channel *c) +{ + return b_peek(&c->buf, co_data(c)); +} + + +/* ci_tail() : returns a pointer to the end of input data in the buffer. + * The "__" variants don't support wrapping, "ofs" are relative to + * the buffer's origin. + */ +static inline size_t __ci_tail_ofs(const struct channel *c) +{ + return __b_peek_ofs(&c->buf, c_data(c)); +} +static inline char *__ci_tail(const struct channel *c) +{ + return __b_peek(&c->buf, c_data(c)); +} +static inline size_t ci_tail_ofs(const struct channel *c) +{ + return b_peek_ofs(&c->buf, c_data(c)); +} +static inline char *ci_tail(const struct channel *c) +{ + return b_peek(&c->buf, c_data(c)); +} + + +/* ci_stop() : returns the pointer to the byte following the end of input data + * in the channel buffer. It may be out of the buffer. It's used to + * compute lengths or stop pointers. + */ +static inline size_t __ci_stop_ofs(const struct channel *c) +{ + return __b_stop_ofs(&c->buf); +} +static inline const char *__ci_stop(const struct channel *c) +{ + return __b_stop(&c->buf); +} +static inline size_t ci_stop_ofs(const struct channel *c) +{ + return b_stop_ofs(&c->buf); +} +static inline const char *ci_stop(const struct channel *c) +{ + return b_stop(&c->buf); +} + + +/* Returns the amount of input data that can contiguously be read at once */ +static inline size_t ci_contig_data(const struct channel *c) +{ + return b_contig_data(&c->buf, co_data(c)); +} + +/* Initialize all fields in the channel. */ +static inline void channel_init(struct channel *chn) +{ + chn->buf = BUF_NULL; + chn->to_forward = 0; + chn->last_read = now_ms; + chn->xfer_small = chn->xfer_large = 0; + chn->total = 0; + chn->analysers = 0; + chn->flags = 0; + chn->output = 0; +} + +/* Schedule up to <bytes> more bytes to be forwarded via the channel without + * notifying the owner task. Any data pending in the buffer are scheduled to be + * sent as well, in the limit of the number of bytes to forward. This must be + * the only method to use to schedule bytes to be forwarded. If the requested + * number is too large, it is automatically adjusted. The number of bytes taken + * into account is returned. Directly touching ->to_forward will cause lockups + * when buf->o goes down to zero if nobody is ready to push the remaining data. + */ +static inline unsigned long long channel_forward(struct channel *chn, unsigned long long bytes) +{ + /* hint: avoid comparisons on long long for the fast case, since if the + * length does not fit in an unsigned it, it will never be forwarded at + * once anyway. + */ + if (bytes <= ~0U) { + unsigned int bytes32 = bytes; + + if (bytes32 <= ci_data(chn)) { + /* OK this amount of bytes might be forwarded at once */ + c_adv(chn, bytes32); + return bytes; + } + } + return __channel_forward(chn, bytes); +} + +/* Forwards any input data and marks the channel for permanent forwarding */ +static inline void channel_forward_forever(struct channel *chn) +{ + c_adv(chn, ci_data(chn)); + chn->to_forward = CHN_INFINITE_FORWARD; +} + +/* <len> bytes of input data was added into the channel <chn>. This functions + * must be called to update the channel state. It also handles the fast + * forwarding. 
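+ *
+ * Editor's example (a hedged sketch, not from the original source; the
+ * receive helper is hypothetical): a producer that has just appended <ret>
+ * bytes of new data into chn->buf would do:
+ *
+ *	ret = my_recv_into(&chn->buf);		// hypothetical receive helper
+ *	if (ret > 0)
+ *		channel_add_input(chn, ret);	// account for it + CF_READ_EVENT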
*/ +static inline void channel_add_input(struct channel *chn, unsigned int len) +{ + if (chn->to_forward) { + unsigned long fwd = len; + if (chn->to_forward != CHN_INFINITE_FORWARD) { + if (fwd > chn->to_forward) + fwd = chn->to_forward; + chn->to_forward -= fwd; + } + c_adv(chn, fwd); + } + /* notify that some data was read */ + chn->total += len; + chn->flags |= CF_READ_EVENT; +} + +static inline unsigned long long channel_htx_forward(struct channel *chn, struct htx *htx, unsigned long long bytes) +{ + unsigned long long ret = 0; + + if (htx->data) { + b_set_data(&chn->buf, htx->data); + ret = channel_forward(chn, bytes); + b_set_data(&chn->buf, b_size(&chn->buf)); + } + return ret; +} + + +static inline void channel_htx_forward_forever(struct channel *chn, struct htx *htx) +{ + c_adv(chn, htx->data - co_data(chn)); + chn->to_forward = CHN_INFINITE_FORWARD; +} +/*********************************************************************/ +/* These functions are used to compute various channel content sizes */ +/*********************************************************************/ + +/* Returns non-zero if the channel is rewritable, which means that the buffer + * it is attached to has at least <maxrewrite> bytes immediately available. + * This is used to decide when a request or response may be parsed when some + * data from a previous exchange might still be present. + */ +static inline int channel_is_rewritable(const struct channel *chn) +{ + int rem = chn->buf.size; + + rem -= b_data(&chn->buf); + rem -= global.tune.maxrewrite; + return rem >= 0; +} + +/* Tells whether data are likely to leave the buffer. This is used to know when + * we can safely ignore the reserve since we know we cannot retry a connection. + * It returns zero if data are blocked, non-zero otherwise. + */ +static inline int channel_may_send(const struct channel *chn) +{ + return chn_cons(chn)->state == SC_ST_EST; +} + +/* HTX version of channel_may_recv(). Returns non-zero if the channel can still + * receive data. */ +static inline int channel_htx_may_recv(const struct channel *chn, const struct htx *htx) +{ + uint32_t rem; + + if (!htx->size) + return 1; + + rem = htx_free_data_space(htx); + if (!rem) + return 0; /* htx already full */ + + if (rem > global.tune.maxrewrite) + return 1; /* reserve not yet reached */ + + if (!channel_may_send(chn)) + return 0; /* don't touch reserve until we can send */ + + /* Now we know there's some room left in the reserve and we may + * forward. As long as i-to_fwd < size-maxrw, we may still + * receive. This is equivalent to i+maxrw-size < to_fwd, + * which is logical since i+maxrw-size is what overlaps with + * the reserve, and we want to ensure they're covered by scheduled + * forwards. + */ + rem += co_data(chn); + if (rem > global.tune.maxrewrite) + return 1; + + return (global.tune.maxrewrite - rem < chn->to_forward); +} + +/* Returns non-zero if the channel can still receive data. This is used to + * decide when to stop reading into a buffer when we want to ensure that we + * leave the reserve untouched after all pending outgoing data are forwarded. + * The reserved space is taken into account if ->to_forward indicates that an + * end of transfer is close to happen. Note that both ->buf.o and ->to_forward + * are considered as available since they're supposed to leave the buffer. The + * test is optimized to avoid as many operations as possible for the fast case + * and to be used as an "if" condition. 
Just like channel_recv_limit(), we + * never allow to overwrite the reserve until the output stream connector is + * connected, otherwise we could spin on a POST with http-send-name-header. + */ +static inline int channel_may_recv(const struct channel *chn) +{ + int rem = chn->buf.size; + + if (IS_HTX_STRM(chn_strm(chn))) + return channel_htx_may_recv(chn, htxbuf(&chn->buf)); + + if (b_is_null(&chn->buf)) + return 1; + + rem -= b_data(&chn->buf); + if (!rem) + return 0; /* buffer already full */ + + if (rem > global.tune.maxrewrite) + return 1; /* reserve not yet reached */ + + if (!channel_may_send(chn)) + return 0; /* don't touch reserve until we can send */ + + /* Now we know there's some room left in the reserve and we may + * forward. As long as i-to_fwd < size-maxrw, we may still + * receive. This is equivalent to i+maxrw-size < to_fwd, + * which is logical since i+maxrw-size is what overlaps with + * the reserve, and we want to ensure they're covered by scheduled + * forwards. + */ + rem = ci_data(chn) + global.tune.maxrewrite - chn->buf.size; + return rem < 0 || (unsigned int)rem < chn->to_forward; +} + +/* Returns true if the channel's input is already closed */ +static inline int channel_input_closed(struct channel *chn) +{ + return ((chn_prod(chn)->flags & (SC_FL_ABRT_DONE|SC_FL_EOS)) != 0); +} + +/* Returns true if the channel's output is already closed */ +static inline int channel_output_closed(struct channel *chn) +{ + return ((chn_cons(chn)->flags & SC_FL_SHUT_DONE) != 0); +} + +/* Check channel timeouts, and set the corresponding flags. */ +static inline void channel_check_timeout(struct channel *chn) +{ + if (likely(!(chn->flags & CF_READ_EVENT)) && unlikely(tick_is_expired(chn->analyse_exp, now_ms))) + chn->flags |= CF_READ_EVENT; +} + + +/* Erase any content from channel <buf> and adjusts flags accordingly. Note + * that any spliced data is not affected since we may not have any access to + * it. + */ +static inline void channel_erase(struct channel *chn) +{ + chn->to_forward = 0; + chn->output = 0; + b_reset(&chn->buf); +} + +static inline void channel_htx_erase(struct channel *chn, struct htx *htx) +{ + htx_reset(htx); + channel_erase(chn); +} + + +/* marks the channel as "shutdown" ASAP in both directions */ +static inline void channel_abort(struct channel *chn) +{ + chn_prod(chn)->flags |= SC_FL_ABRT_WANTED; + chn_cons(chn)->flags |= SC_FL_SHUT_WANTED; + chn->flags |= CF_AUTO_CLOSE; + chn->flags &= ~CF_AUTO_CONNECT; +} + +/* allow the consumer to try to establish a new connection. */ +static inline void channel_auto_connect(struct channel *chn) +{ + chn->flags |= CF_AUTO_CONNECT; +} + +/* prevent the consumer from trying to establish a new connection, and also + * disable auto shutdown forwarding. 
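+ *
+ * Editor's note (a hedged usage sketch, not from the original source): an
+ * analyser that still needs more request data before routing typically pairs
+ * these helpers:
+ *
+ *	channel_dont_connect(chn);	// hold off the connection attempt
+ *	// ... once enough data was inspected and routing may proceed:
+ *	channel_auto_connect(chn);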
+ */ +static inline void channel_dont_connect(struct channel *chn) +{ + chn->flags &= ~(CF_AUTO_CONNECT|CF_AUTO_CLOSE); +} + +/* allow the producer to forward shutdown requests */ +static inline void channel_auto_close(struct channel *chn) +{ + chn->flags |= CF_AUTO_CLOSE; +} + +/* prevent the producer from forwarding shutdown requests */ +static inline void channel_dont_close(struct channel *chn) +{ + chn->flags &= ~CF_AUTO_CLOSE; +} + +/* allow the producer to read / poll the input */ +static inline void channel_auto_read(struct channel *chn) +{ + chn->flags &= ~CF_DONT_READ; +} + +/* prevent the producer from read / poll the input */ +static inline void channel_dont_read(struct channel *chn) +{ + chn->flags |= CF_DONT_READ; +} + + +/*************************************************/ +/* Buffer operations in the context of a channel */ +/*************************************************/ + + +/* Return the max number of bytes the buffer can contain so that once all the + * pending bytes are forwarded, the buffer still has global.tune.maxrewrite + * bytes free. The result sits between chn->size - maxrewrite and chn->size. + * It is important to mention that if buf->i is already larger than size-maxrw + * the condition above cannot be satisfied and the lowest size will be returned + * anyway. The principles are the following : + * 0) the empty buffer has a limit of zero + * 1) a non-connected buffer cannot touch the reserve + * 2) infinite forward can always fill the buffer since all data will leave + * 3) all output bytes are considered in transit since they're leaving + * 4) all input bytes covered by to_forward are considered in transit since + * they'll be converted to output bytes. + * 5) all input bytes not covered by to_forward as considered remaining + * 6) all bytes scheduled to be forwarded minus what is already in the input + * buffer will be in transit during future rounds. + * 7) 4+5+6 imply that the amount of input bytes (i) is irrelevant to the max + * usable length, only to_forward and output count. The difference is + * visible when to_forward > i. + * 8) the reserve may be covered up to the amount of bytes in transit since + * these bytes will only take temporary space. + * + * A typical buffer looks like this : + * + * <-------------- max_len -----------> + * <---- o ----><----- i -----> <--- 0..maxrewrite ---> + * +------------+--------------+-------+----------------------+ + * |////////////|\\\\\\\\\\\\\\|xxxxxxx| reserve | + * +------------+--------+-----+-------+----------------------+ + * <- fwd -> <-avail-> + * + * Or when to_forward > i : + * + * <-------------- max_len -----------> + * <---- o ----><----- i -----> <--- 0..maxrewrite ---> + * +------------+--------------+-------+----------------------+ + * |////////////|\\\\\\\\\\\\\\|xxxxxxx| reserve | + * +------------+--------+-----+-------+----------------------+ + * <-avail-> + * <------------------ fwd ----------------> + * + * - the amount of buffer bytes in transit is : min(i, fwd) + o + * - some scheduled bytes may be in transit (up to fwd - i) + * - the reserve is max(0, maxrewrite - transit) + * - the maximum usable buffer length is size - reserve. + * - the available space is max_len - i - o + * + * So the formula to compute the buffer's maximum length to protect the reserve + * when reading new data is : + * + * max = size - maxrewrite + min(maxrewrite, transit) + * = size - max(maxrewrite - transit, 0) + * + * But WARNING! 
The conditions might change during the transfer and it could
+ * very well happen that a buffer would contain more bytes than max_len due to
+ * i+o already walking over the reserve (e.g. after a header rewrite), including
+ * i or o alone hitting the limit. So it is critical to always consider that
+ * bounds may have already been crossed and that available space may be
+ * negative, for example. Due to this it is perfectly possible for this
+ * function to return a value that is lower than current i+o.
+ */
+static inline int channel_recv_limit(const struct channel *chn)
+{
+	unsigned int transit;
+	int reserve;
+
+	/* return zero if empty */
+	reserve = chn->buf.size;
+	if (b_is_null(&chn->buf))
+		goto end;
+
+	/* return size - maxrewrite if we can't send */
+	reserve = global.tune.maxrewrite;
+	if (unlikely(!channel_may_send(chn)))
+		goto end;
+
+	/* We need to check what remains of the reserve after o and to_forward
+	 * have been transmitted, but they can overflow together and they can
+	 * cause an integer underflow in the comparison since both are unsigned
+	 * while maxrewrite is signed.
+	 * The code below has been verified for being a valid check for this :
+	 *   - if (o + to_forward) overflow => return size  [ large enough ]
+	 *   - if o + to_forward >= maxrw   => return size  [ large enough ]
+	 *   - otherwise return size - (maxrw - (o + to_forward))
+	 */
+	transit = co_data(chn) + chn->to_forward;
+	reserve -= transit;
+	if (transit < chn->to_forward ||                 // addition overflow
+	    transit >= (unsigned)global.tune.maxrewrite) // enough transit data
+		return chn->buf.size;
+ end:
+	return chn->buf.size - reserve;
+}
+
+/* HTX version of channel_recv_limit(). Return the max number of bytes the HTX
+ * buffer can contain so that once all the pending bytes are forwarded, the
+ * buffer still has global.tune.maxrewrite bytes free.
+ */
+static inline int channel_htx_recv_limit(const struct channel *chn, const struct htx *htx)
+{
+	unsigned int transit;
+	int reserve;
+
+	/* return zero if not allocated */
+	if (!htx->size)
+		return 0;
+
+	/* return max_data_space - maxrewrite if we can't send */
+	reserve = global.tune.maxrewrite;
+	if (unlikely(!channel_may_send(chn)))
+		goto end;
+
+	/* We need to check what remains of the reserve after o and to_forward
+	 * have been transmitted, but they can overflow together and they can
+	 * cause an integer underflow in the comparison since both are unsigned
+	 * while maxrewrite is signed.
+	 * The code below has been verified for being a valid check for this :
+	 *   - if (o + to_forward) overflow => return htx->size  [ large enough ]
+	 *   - if o + to_forward >= maxrw   => return htx->size  [ large enough ]
+	 *   - otherwise return htx->size - (maxrw - (o + to_forward))
+	 */
+	transit = co_data(chn) + chn->to_forward;
+	reserve -= transit;
+	if (transit < chn->to_forward ||                 // addition overflow
+	    transit >= (unsigned)global.tune.maxrewrite) // enough transit data
+		return htx->size;
+ end:
+	return (htx->size - reserve);
+}
+
+/* HTX version of channel_full(). Instead of checking if INPUT data exceeds
+ * (size - reserve), this function checks if the free space for data in <htx>
+ * and the data scheduled for output are lower than the reserve. In this case,
+ * the channel is considered as full.
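+ *
+ * Editor's worked example of the reserve arithmetic used by the two
+ * *_recv_limit() functions above (numbers are illustrative only): with
+ * size = 16384, maxrewrite = 1024 and o + to_forward = 400 bytes in transit,
+ * the returned limit is 16384 - (1024 - 400) = 15760 bytes.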
+ */
+static inline int channel_htx_full(const struct channel *c, const struct htx *htx,
+				   unsigned int reserve)
+{
+	if (!htx->size)
+		return 0;
+	return (htx_free_data_space(htx) + co_data(c) <= reserve);
+}
+
+/* Returns non-zero if the channel's INPUT buffer is considered full, which
+ * means that it holds at least as much INPUT data as (size - reserve). This
+ * also means that data that are scheduled for output are considered as potential
+ * free space, and that the reserved space is always considered as not usable.
+ * This information alone cannot be used as a general purpose free space indicator.
+ * However it accurately indicates that too many data were fed in the buffer
+ * for an analyzer for instance. See the channel_may_recv() function for a more
+ * generic function taking everything into account.
+ */
+static inline int channel_full(const struct channel *c, unsigned int reserve)
+{
+	if (b_is_null(&c->buf))
+		return 0;
+
+	if (IS_HTX_STRM(chn_strm(c)))
+		return channel_htx_full(c, htxbuf(&c->buf), reserve);
+
+	return (ci_data(c) + reserve >= c_size(c));
+}
+
+/* HTX version of channel_recv_max(). */
+static inline int channel_htx_recv_max(const struct channel *chn, const struct htx *htx)
+{
+	int ret;
+
+	ret = channel_htx_recv_limit(chn, htx) - htx_used_space(htx);
+	if (ret < 0)
+		ret = 0;
+	return ret;
+}
+
+/* Returns the amount of space available at the input of the buffer, taking the
+ * reserved space into account if ->to_forward indicates that an end of transfer
+ * is close to happen. The test is optimized to avoid as many operations as
+ * possible for the fast case.
+ */
+static inline int channel_recv_max(const struct channel *chn)
+{
+	int ret;
+
+	if (IS_HTX_STRM(chn_strm(chn)))
+		return channel_htx_recv_max(chn, htxbuf(&chn->buf));
+
+	ret = channel_recv_limit(chn) - b_data(&chn->buf);
+	if (ret < 0)
+		ret = 0;
+	return ret;
+}
+
+/* Returns the maximum absolute amount of data that can be copied in a channel,
+ * taking the reserved space into account but also the HTX overhead for HTX
+ * streams.
+ */
+static inline size_t channel_data_limit(const struct channel *chn)
+{
+	size_t max = (global.tune.bufsize - global.tune.maxrewrite);
+
+	if (IS_HTX_STRM(chn_strm(chn)))
+		max -= HTX_BUF_OVERHEAD;
+	return max;
+}
+
+/* Returns the amount of data in a channel, taking the HTX streams into
+ * account. For raw channels, it is equivalent to c_data. For HTX channels, we
+ * rely on the HTX API.
+ */
+static inline size_t channel_data(const struct channel *chn)
+{
+	return (IS_HTX_STRM(chn_strm(chn)) ? htx_used_space(htxbuf(&chn->buf)) : c_data(chn));
+}
+
+/* Returns the amount of input data in a channel, taking the HTX streams into
+ * account. This function relies on channel_data().
+ */
+static inline size_t channel_input_data(const struct channel *chn)
+{
+	return channel_data(chn) - co_data(chn);
+}
+
+/* Returns 1 if the channel is empty, taking the HTX streams into account */
+static inline size_t channel_empty(const struct channel *chn)
+{
+	return (IS_HTX_STRM(chn_strm(chn)) ? htx_is_empty(htxbuf(&chn->buf)) : c_empty(chn));
+}
+
+
+/* Returns the amount of bytes that can be written over the input data at once,
+ * including reserved space which may be overwritten. This is used by Lua to
+ * insert data in the input side just before the other data using buffer_replace().
+ * The goal is to transfer these new data into the output buffer.
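+ *
+ * Editor's illustration (an assumption about the common non-wrapping case):
+ * when the output data sits at the buffer's head and the input does not wrap,
+ * the value returned below is simply b_wrap(buf) - ci_tail(chn), i.e. the
+ * contiguous room between the end of the input data and the buffer's end.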
+ */ +static inline int ci_space_for_replace(const struct channel *chn) +{ + const struct buffer *buf = &chn->buf; + const char *end; + + /* If the input side data overflows, we cannot insert data contiguously. */ + if (b_head(buf) + b_data(buf) >= b_wrap(buf)) + return 0; + + /* Check the last byte used in the buffer, it may be a byte of the output + * side if the buffer wraps, or its the end of the buffer. + */ + end = b_head(buf); + if (end <= ci_head(chn)) + end = b_wrap(buf); + + /* Compute the amount of bytes which can be written. */ + return end - ci_tail(chn); +} + +/* Allocates a buffer for channel <chn>. Returns 0 in case of failure, non-zero + * otherwise. + * + * If no buffer are available, the requester, represented by <wait> pointer, + * will be added in the list of objects waiting for an available buffer. + */ +static inline int channel_alloc_buffer(struct channel *chn, struct buffer_wait *wait) +{ + if (b_alloc(&chn->buf) != NULL) + return 1; + + if (!LIST_INLIST(&wait->list)) + LIST_APPEND(&th_ctx->buffer_wq, &wait->list); + + return 0; +} + +/* Releases a possibly allocated buffer for channel <chn>. If it was not + * allocated, this function does nothing. Else the buffer is released and we try + * to wake up as many streams/applets as possible. */ +static inline void channel_release_buffer(struct channel *chn, struct buffer_wait *wait) +{ + if (c_size(chn) && c_empty(chn)) { + b_free(&chn->buf); + offer_buffers(wait->target, 1); + } +} + +/* Truncate any unread data in the channel's buffer, and disable forwarding. + * Outgoing data are left intact. This is mainly to be used to send error + * messages after existing data. + */ +static inline void channel_truncate(struct channel *chn) +{ + if (!co_data(chn)) + return channel_erase(chn); + + chn->to_forward = 0; + if (!ci_data(chn)) + return; + + chn->buf.data = co_data(chn); +} + +static inline void channel_htx_truncate(struct channel *chn, struct htx *htx) +{ + if (!co_data(chn)) + return channel_htx_erase(chn, htx); + + chn->to_forward = 0; + if (htx->data == co_data(chn)) + return; + htx_truncate(htx, co_data(chn)); +} + +/* This function realigns a possibly wrapping channel buffer so that the input + * part is contiguous and starts at the beginning of the buffer and the output + * part ends at the end of the buffer. This provides the best conditions since + * it allows the largest inputs to be processed at once and ensures that once + * the output data leaves, the whole buffer is available at once. + */ +static inline void channel_slow_realign(struct channel *chn, char *swap) +{ + return b_slow_realign(&chn->buf, swap, co_data(chn)); +} + + +/* Forward all headers of an HTX message, starting from the SL to the EOH. This + * function returns the position of the block after the EOH, if + * found. Otherwise, it returns -1. + */ +static inline int32_t channel_htx_fwd_headers(struct channel *chn, struct htx *htx) +{ + int32_t pos; + size_t data = 0; + + for (pos = htx_get_first(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_blk *blk = htx_get_blk(htx, pos); + data += htx_get_blksz(blk); + if (htx_get_blk_type(blk) == HTX_BLK_EOH) { + pos = htx_get_next(htx, pos); + break; + } + } + c_adv(chn, data); + return pos; +} + +/* + * Advance the channel buffer's read pointer by <len> bytes. This is useful + * when data have been read directly from the buffer. It is illegal to call + * this function with <len> causing a wrapping at the end of the buffer. 
It's + * the caller's responsibility to ensure that <len> is never larger than + * chn->o. + */ +static inline void co_skip(struct channel *chn, int len) +{ + BUG_ON_HOT(len > chn->output); + b_del(&chn->buf, len); + chn->output -= len; + c_realign_if_empty(chn); +} + +/* HTX version of co_skip(). This function skips at most <len> bytes from the + * output of the channel <chn>. Depending on how data are stored in <htx> less + * than <len> bytes can be skipped.. + */ +static inline void co_htx_skip(struct channel *chn, struct htx *htx, int len) +{ + struct htx_ret htxret; + + htxret = htx_drain(htx, len); + if (htxret.ret) { + BUG_ON_HOT(htxret.ret > chn->output); + chn->output -= htxret.ret; + } +} + +/* Tries to copy chunk <chunk> into the channel's buffer after length controls. + * The chn->o and to_forward pointers are updated. If the channel's input is + * closed, -2 is returned. If the block is too large for this buffer, -3 is + * returned. If there is not enough room left in the buffer, -1 is returned. + * Otherwise the number of bytes copied is returned (0 being a valid number). + * Channel flag READ_PARTIAL is updated if some data can be transferred. The + * chunk's length is updated with the number of bytes sent. + */ +static inline int ci_putchk(struct channel *chn, struct buffer *chunk) +{ + int ret; + + ret = ci_putblk(chn, chunk->area, chunk->data); + if (ret > 0) + chunk->data -= ret; + return ret; +} + +/* Tries to copy string <str> at once into the channel's buffer after length + * controls. The chn->o and to_forward pointers are updated. If the channel's + * input is closed, -2 is returned. If the block is too large for this buffer, + * -3 is returned. If there is not enough room left in the buffer, -1 is + * returned. Otherwise the number of bytes copied is returned (0 being a valid + * number). Channel flag READ_PARTIAL is updated if some data can be + * transferred. + */ +static inline int ci_putstr(struct channel *chn, const char *str) +{ + return ci_putblk(chn, str, strlen(str)); +} + +/* + * Return one char from the channel's buffer. If the buffer is empty and the + * channel is closed, return -2. If the buffer is just empty, return -1. The + * buffer's pointer is not advanced, it's up to the caller to call co_skip(buf, + * 1) when it has consumed the char. Also note that this function respects the + * chn->o limit. + */ +static inline int co_getchr(struct channel *chn) +{ + /* closed or empty + imminent close = -2; empty = -1 */ + if (unlikely((chn_cons(chn)->flags & SC_FL_SHUT_DONE) || !co_data(chn))) { + if (chn_cons(chn)->flags & (SC_FL_SHUT_DONE|SC_FL_SHUT_WANTED)) + return -2; + return -1; + } + return *co_head(chn); +} + +#endif /* _HAPROXY_CHANNEL_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/check-t.h b/include/haproxy/check-t.h new file mode 100644 index 0000000..eb080a9 --- /dev/null +++ b/include/haproxy/check-t.h @@ -0,0 +1,198 @@ +/* + * include/haproxy/check-t.h + * Health-checks definitions, enums, macros and bitfields. + * + * Copyright 2008-2009 Krzysztof Piotr Oledzki <ole@ans.pl> + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ *
+ */
+
+#ifndef _HAPROXY_CHECKS_T_H
+#define _HAPROXY_CHECKS_T_H
+
+#include <sys/time.h>
+
+#include <import/ebtree-t.h>
+#include <import/ist.h>
+#include <haproxy/api-t.h>
+#include <haproxy/buf-t.h>
+#include <haproxy/connection-t.h>
+#include <haproxy/dynbuf-t.h>
+#include <haproxy/obj_type-t.h>
+#include <haproxy/vars-t.h>
+
+/* Please note: this file commonly tends to be part of circular dependencies,
+ * so it is important to keep its includes list to the minimum possible (i.e.
+ * only types whose size needs to be known). Since there are no function
+ * prototypes nor pointers here, forward declarations are not really necessary.
+ * This file ought to be split into multiple parts, at least regular checks vs
+ * tcp-checks.
+ */
+
+/* enum used by check->result. Must remain in this order, as some code uses
+ * result >= CHK_RES_PASSED to declare success.
+ */
+enum chk_result {
+	CHK_RES_UNKNOWN = 0,	/* initialized to this by default */
+	CHK_RES_NEUTRAL,	/* valid check but no status information */
+	CHK_RES_FAILED,		/* check failed */
+	CHK_RES_PASSED,		/* check succeeded and server is fully up again */
+	CHK_RES_CONDPASS,	/* check reports the server doesn't want new sessions */
+};
+
+/* flags used by check->state */
+#define CHK_ST_INPROGRESS 0x0001 /* a check is currently running */
+#define CHK_ST_CONFIGURED 0x0002 /* this check is configured and may be enabled */
+#define CHK_ST_ENABLED 0x0004 /* this check is currently administratively enabled */
+#define CHK_ST_PAUSED 0x0008 /* checks are paused because of maintenance (health only) */
+#define CHK_ST_AGENT 0x0010 /* check is an agent check (otherwise it's a health check) */
+#define CHK_ST_PORT_MISS 0x0020 /* check can't be sent because no port is configured to run it */
+#define CHK_ST_IN_ALLOC 0x0040 /* check blocked waiting for input buffer allocation */
+#define CHK_ST_OUT_ALLOC 0x0080 /* check blocked waiting for output buffer allocation */
+#define CHK_ST_CLOSE_CONN 0x0100 /* check is waiting that the connection gets closed */
+#define CHK_ST_PURGE 0x0200 /* check must be freed */
+#define CHK_ST_FASTINTER 0x0400 /* force fastinter check */
+#define CHK_ST_READY 0x0800 /* check ready to migrate or run, see below */
+#define CHK_ST_SLEEPING 0x1000 /* check was sleeping, i.e.
not currently bound to a thread, see below */ + +/* 4 possible states for CHK_ST_SLEEPING and CHK_ST_READY: + * SLP RDY State Description + * 0 0 QUEUED Check is in queue due to concurrency limit + * 0 1 RUNNING Check is bound to current thread and running + * 1 0 SLEEPING Check is sleeping, not bound to a thread + * 1 1 MIGRATING Check is migrating to another thread + */ + +/* check status */ +enum healthcheck_status { + HCHK_STATUS_UNKNOWN = 0, /* Unknown */ + HCHK_STATUS_INI, /* Initializing */ + HCHK_STATUS_START, /* Check started - SPECIAL STATUS */ + + /* Below we have finished checks */ + HCHK_STATUS_CHECKED, /* DUMMY STATUS */ + + HCHK_STATUS_HANA, /* Health analyze detected enough consecutive errors */ + + HCHK_STATUS_SOCKERR, /* Socket error */ + + HCHK_STATUS_L4OK, /* L4 check passed, for example tcp connect */ + HCHK_STATUS_L4TOUT, /* L4 timeout */ + HCHK_STATUS_L4CON, /* L4 connection problem, for example: */ + /* "Connection refused" (tcp rst) or "No route to host" (icmp) */ + + HCHK_STATUS_L6OK, /* L6 check passed */ + HCHK_STATUS_L6TOUT, /* L6 (SSL) timeout */ + HCHK_STATUS_L6RSP, /* L6 invalid response - protocol error */ + + HCHK_STATUS_L7TOUT, /* L7 (HTTP/SMTP) timeout */ + HCHK_STATUS_L7RSP, /* L7 invalid response - protocol error */ + + /* Below we have layer 5-7 data available */ + HCHK_STATUS_L57DATA, /* DUMMY STATUS */ + HCHK_STATUS_L7OKD, /* L7 check passed */ + HCHK_STATUS_L7OKCD, /* L7 check conditionally passed */ + HCHK_STATUS_L7STS, /* L7 response error, for example HTTP 5xx */ + + HCHK_STATUS_PROCERR, /* External process check failure */ + HCHK_STATUS_PROCTOUT, /* External process check timeout */ + HCHK_STATUS_PROCOK, /* External process check passed */ + + HCHK_STATUS_SIZE +}; + +/* health status for response tracking */ +enum { + HANA_STATUS_UNKNOWN = 0, + + HANA_STATUS_L4_OK, /* L4 successful connection */ + HANA_STATUS_L4_ERR, /* L4 unsuccessful connection */ + + HANA_STATUS_HTTP_OK, /* Correct http response */ + HANA_STATUS_HTTP_STS, /* Wrong http response, for example HTTP 5xx */ + HANA_STATUS_HTTP_HDRRSP, /* Invalid http response (headers) */ + HANA_STATUS_HTTP_RSP, /* Invalid http response */ + + HANA_STATUS_HTTP_READ_ERROR, /* Read error */ + HANA_STATUS_HTTP_READ_TIMEOUT, /* Read timeout */ + HANA_STATUS_HTTP_BROKEN_PIPE, /* Unexpected close from server */ + + HANA_STATUS_SIZE +}; + +enum { + HANA_ONERR_UNKNOWN = 0, + + HANA_ONERR_FASTINTER, /* Force fastinter*/ + HANA_ONERR_FAILCHK, /* Simulate a failed check */ + HANA_ONERR_SUDDTH, /* Enters sudden death - one more failed check will mark this server down */ + HANA_ONERR_MARKDWN, /* Mark this server down, now! */ +}; + +enum { + HANA_ONMARKEDDOWN_NONE = 0, + HANA_ONMARKEDDOWN_SHUTDOWNSESSIONS, /* Shutdown peer sessions */ +}; + +enum { + HANA_ONMARKEDUP_NONE = 0, + HANA_ONMARKEDUP_SHUTDOWNBACKUPSESSIONS, /* Shutdown peer sessions */ +}; + +enum { + HANA_OBS_NONE = 0, + + HANA_OBS_LAYER4, /* Observe L4 - for example tcp */ + HANA_OBS_LAYER7, /* Observe L7 - for example http */ + + HANA_OBS_SIZE +}; + +struct tcpcheck_rule; +struct tcpcheck_rules; + +struct check { + enum obj_type obj_type; /* object type == OBJ_TYPE_CHECK */ + struct session *sess; /* Health check session. */ + struct vars vars; /* Health check dynamic variables. 
+	struct xprt_ops *xprt;			/* transport layer operations for health checks */
+	struct stconn *sc;			/* stream connector used by health checks */
+	struct buffer bi, bo;			/* input and output buffers to send/recv check */
+	struct buffer_wait buf_wait;		/* Wait list for buffer allocation */
+	struct task *task;			/* the task associated to the health check processing, NULL if disabled */
+	ullong start;				/* last health check start time */
+	long duration;				/* time in ms taken to finish last health check */
+	short status, code;			/* check result, check code */
+	unsigned short port;			/* the port to use for the health checks */
+	char desc[HCHK_DESC_LEN];		/* health check description */
+	signed char use_ssl;			/* use SSL for health checks (1: on, 0: server mode, -1: off) */
+	int send_proxy;				/* send a PROXY protocol header with checks */
+	struct tcpcheck_rules *tcpcheck_rules;	/* tcp-check send / expect rules */
+	struct tcpcheck_rule *current_step;	/* current step when using tcpcheck */
+	int inter, fastinter, downinter;	/* checks: time in milliseconds */
+	enum chk_result result;			/* health-check result : CHK_RES_* */
+	int state;				/* state of the check : CHK_ST_* */
+	int health;				/* 0 to rise-1 = bad;
+						 * rise to rise+fall-1 = good */
+	int rise, fall;				/* time in iterations */
+	int type;				/* Check type, one of PR_O2_*_CHK */
+	struct server *server;			/* back-pointer to server */
+	struct proxy *proxy;			/* proxy to be used */
+	char **argv;				/* the arguments to use if running a process-based check */
+	char **envp;				/* the environment to use if running a process-based check */
+	struct pid_list *curpid;		/* entry in pid_list used for current process-based test, or -1 if not in test */
+	struct sockaddr_storage addr;		/* the address to check */
+	char *sni;				/* Server name */
+	char *alpn_str;				/* ALPN to use for checks */
+	int alpn_len;				/* ALPN string length */
+	const struct mux_proto_list *mux_proto;	/* the mux to use for all outgoing connections (specified by the "proto" keyword) */
+	struct list check_queue;		/* entry in the check queue. Not empty = in queue. */
+	int via_socks4;				/* check the connection via socks4 proxy */
+};
+
+#endif /* _HAPROXY_CHECKS_T_H */
diff --git a/include/haproxy/check.h b/include/haproxy/check.h
new file mode 100644
index 0000000..c90d3e7
--- /dev/null
+++ b/include/haproxy/check.h
@@ -0,0 +1,131 @@
+/*
+ * include/haproxy/check.h
+ * Function prototypes for the checks.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_CHECKS_H +#define _HAPROXY_CHECKS_H + +#include <haproxy/check-t.h> +#include <haproxy/proxy-t.h> +#include <haproxy/server-t.h> +#include <haproxy/trace-t.h> + +extern struct trace_source trace_check; + +/* Details about these events are defined in <src/check.c> */ +#define CHK_EV_TASK_WAKE (1ULL << 0) +#define CHK_EV_HCHK_START (1ULL << 1) +#define CHK_EV_HCHK_WAKE (1ULL << 2) +#define CHK_EV_HCHK_RUN (1ULL << 3) +#define CHK_EV_HCHK_END (1ULL << 4) +#define CHK_EV_HCHK_SUCC (1ULL << 5) +#define CHK_EV_HCHK_ERR (1ULL << 6) +#define CHK_EV_HCHK (CHK_EV_HCHK_START|CHK_EV_HCHK_WAKE|CHK_EV_HCHK_RUN|\ + CHK_EV_HCHK_END|CHK_EV_HCHK_SUCC|CHK_EV_HCHK_ERR) + +#define CHK_EV_TCPCHK_EVAL (1ULL << 7) +#define CHK_EV_TCPCHK_ERR (1ULL << 8) +#define CHK_EV_TCPCHK_CONN (1ULL << 9) +#define CHK_EV_TCPCHK_SND (1ULL << 10) +#define CHK_EV_TCPCHK_EXP (1ULL << 11) +#define CHK_EV_TCPCHK_ACT (1ULL << 12) +#define CHK_EV_TCPCHK (CHK_EV_TCPCHK_EVAL|CHK_EV_TCPCHK_ERR|CHK_EV_TCPCHK_CONN|\ + CHK_EV_TCPCHK_SND|CHK_EV_TCPCHK_EXP|CHK_EV_TCPCHK_ACT) + +#define CHK_EV_RX_DATA (1ULL << 13) +#define CHK_EV_RX_BLK (1ULL << 14) +#define CHK_EV_RX_ERR (1ULL << 15) +#define CHK_EV_RX (CHK_EV_RX_DATA|CHK_EV_RX_BLK|CHK_EV_RX_ERR) + +#define CHK_EV_TX_DATA (1ULL << 16) +#define CHK_EV_TX_BLK (1ULL << 17) +#define CHK_EV_TX_ERR (1ULL << 18) +#define CHK_EV_TX (CHK_EV_TX_DATA|CHK_EV_TX_BLK|CHK_EV_TX_ERR) + +extern struct data_cb check_conn_cb; +extern struct proxy checks_fe; + +short get_check_status_result(short check_status); +const char *get_check_status_description(short check_status); +const char *get_check_status_info(short check_status); +int httpchk_build_status_header(struct server *s, struct buffer *buf); +void __health_adjust(struct server *s, short status); +void check_append_info(struct buffer *msg, struct check *check); +void set_server_check_status(struct check *check, short status, const char *desc); +void chk_report_conn_err(struct check *check, int errno_bck, int expired); +void check_notify_failure(struct check *check); +void check_notify_stopping(struct check *check); +void check_notify_success(struct check *check); +struct task *process_chk(struct task *t, void *context, unsigned int state); + +struct task *srv_chk_io_cb(struct task *t, void *ctx, unsigned int state); + +int check_buf_available(void *target); +struct buffer *check_get_buf(struct check *check, struct buffer *bptr); +void check_release_buf(struct check *check, struct buffer *bptr); +const char *init_check(struct check *check, int type); +void free_check(struct check *check); +void check_purge(struct check *check); +int wake_srv_chk(struct stconn *sc); + +int init_srv_check(struct server *srv); +int init_srv_agent_check(struct server *srv); +int start_check_task(struct check *check, int mininter, int nbcheck, int srvpos); + +/* Declared here, but the definitions are in flt_spoe.c */ +int spoe_prepare_healthcheck_request(char **req, int *len); +int spoe_handle_healthcheck_response(char *frame, size_t size, char *err, int errlen); + +int set_srv_agent_send(struct server *srv, const char *send); + +/* set agent addr and appropriate flag */ +static inline void set_srv_agent_addr(struct server *srv, struct sockaddr_storage *sk) +{ + srv->agent.addr = *sk; + srv->flags |= SRV_F_AGENTADDR; +} + +/* set 
agent port and appropriate flag */
+static inline void set_srv_agent_port(struct server *srv, int port)
+{
+	srv->agent.port = port;
+	srv->flags |= SRV_F_AGENTPORT;
+}
+
+/* Use this one only. This inline version only ensures that we don't
+ * call the function when the observe mode is disabled.
+ */
+static inline void health_adjust(struct server *s, short status)
+{
+	/* do nothing if neither observe mode nor the health check task is enabled */
+	if (!s->observe || !s->check.task)
+		return;
+
+	__health_adjust(s, status);
+}
+
+#endif /* _HAPROXY_CHECKS_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/chunk.h b/include/haproxy/chunk.h
new file mode 100644
index 0000000..43c7270
--- /dev/null
+++ b/include/haproxy/chunk.h
@@ -0,0 +1,303 @@
+/*
+ * include/haproxy/chunk.h
+ * Chunk management definitions, macros and inline functions.
+ *
+ * Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_CHUNK_H
+#define _HAPROXY_CHUNK_H
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <import/ist.h>
+#include <haproxy/api.h>
+#include <haproxy/buf-t.h>
+#include <haproxy/pool.h>
+
+
+extern struct pool_head *pool_head_trash;
+
+/* function prototypes */
+
+int chunk_printf(struct buffer *chk, const char *fmt, ...)
+	__attribute__ ((format(printf, 2, 3)));
+
+int chunk_appendf(struct buffer *chk, const char *fmt, ...)
+	__attribute__ ((format(printf, 2, 3)));
+
+int chunk_htmlencode(struct buffer *dst, struct buffer *src);
+int chunk_asciiencode(struct buffer *dst, struct buffer *src, char qc);
+int chunk_strcmp(const struct buffer *chk, const char *str);
+int chunk_strcasecmp(const struct buffer *chk, const char *str);
+struct buffer *get_trash_chunk(void);
+int init_trash_buffers(int first);
+
+static inline void chunk_reset(struct buffer *chk)
+{
+	chk->data = 0;
+}
+
+static inline void chunk_init(struct buffer *chk, char *str, size_t size)
+{
+	chk->area = str;
+	chk->head = 0;
+	chk->data = 0;
+	chk->size = size;
+}
+
+/* report 0 in case of error, 1 if OK. */
+static inline int chunk_initlen(struct buffer *chk, char *str, size_t size,
+				int len)
+{
+
+	if (len < 0 || (size && len > size))
+		return 0;
+
+	chk->area = str;
+	chk->head = 0;
+	chk->data = len;
+	chk->size = size;
+
+	return 1;
+}
+
+/* this is only for temporary manipulation, the chunk is read-only */
+static inline void chunk_initstr(struct buffer *chk, const char *str)
+{
+	chk->area = (char *)str;
+	chk->head = 0;
+	chk->data = strlen(str);
+	chk->size = 0;			/* mark it read-only */
+}
+
+/*
+ * Allocate a trash chunk from the reentrant pool. The buffer starts at the
+ * end of the chunk. This chunk must be freed using free_trash_chunk(). This
+ * call may fail and the caller is responsible for checking that the returned
+ * pointer is not NULL.
+ */
+static forceinline struct buffer *alloc_trash_chunk(void)
+{
+	struct buffer *chunk;
+
+	chunk = pool_alloc(pool_head_trash);
+	if (chunk) {
+		char *buf = (char *)chunk + sizeof(struct buffer);
+		*buf = 0;
+		chunk_init(chunk, buf,
+			   pool_head_trash->size - sizeof(struct buffer));
+	}
+	return chunk;
+}
+
+/*
+ * free a trash chunk allocated by alloc_trash_chunk(). NOP on NULL.
+ */
+static forceinline void free_trash_chunk(struct buffer *chunk)
+{
+	pool_free(pool_head_trash, chunk);
+}
+
+/* copies chunk <src> into <chk>. Returns 0 in case of failure. */
+static inline int chunk_cpy(struct buffer *chk, const struct buffer *src)
+{
+	if (unlikely(src->data > chk->size))
+		return 0;
+
+	chk->data = src->data;
+	memcpy(chk->area, src->area, src->data);
+	return 1;
+}
+
+/* copies memory area <src> into <chk> for <len> bytes. Returns 0 in
+ * case of failure. No trailing zero is added.
+ */
+static inline int chunk_memcpy(struct buffer *chk, const char *src,
+			       size_t len)
+{
+	if (unlikely(len > chk->size))
+		return 0;
+
+	chk->data = len;
+	memcpy(chk->area, src, len);
+
+	return 1;
+}
+
+/* appends memory area <src> after <chk> for <len> bytes. Returns 0 in
+ * case of failure. No trailing zero is added.
+ */
+static inline int chunk_memcat(struct buffer *chk, const char *src,
+			       size_t len)
+{
+	if (unlikely(chk->data + len > chk->size))
+		return 0;
+
+	memcpy(chk->area + chk->data, src, len);
+	chk->data += len;
+	return 1;
+}
+
+/* appends ist <src> after <chk>. Returns 0 in case of failure. */
+static inline int chunk_istcat(struct buffer *chk, const struct ist src)
+{
+	return chunk_memcat(chk, istptr(src), istlen(src));
+}
+
+/* appends chunk <src> after <chk>. Returns 0 in case of failure. */
+static inline int chunk_cat(struct buffer *chk, const struct buffer *src)
+{
+	return chunk_memcat(chk, src->area, src->data);
+}
+
+/* copies str into <chk> followed by a trailing zero. Returns 0 in
+ * case of failure.
+ */
+static inline int chunk_strcpy(struct buffer *chk, const char *str)
+{
+	size_t len;
+
+	len = strlen(str);
+
+	if (unlikely(len >= chk->size))
+		return 0;
+
+	chk->data = len;
+	memcpy(chk->area, str, len + 1);
+
+	return 1;
+}
+
+/* copies at most <max> chars from str into <chk> followed by a trailing zero.
+ * Returns 0 in case of failure.
+ */
+static inline int chunk_strncpy(struct buffer *chk, const char *str, size_t max)
+{
+	size_t len;
+
+	len = strlen(str);
+	if (len > max)
+		len = max;
+
+	if (unlikely(len >= chk->size))
+		return 0;
+
+	memcpy(chk->area, str, len);
+	chk->area[len] = 0;
+	chk->data = len;
+	return 1;
+}
+
+/* appends str after <chk> followed by a trailing zero. Returns 0 in
+ * case of failure.
+ */
+static inline int chunk_strcat(struct buffer *chk, const char *str)
+{
+	size_t len;
+
+	len = strlen(str);
+
+	if (unlikely(chk->data + len >= chk->size))
+		return 0;
+
+	memcpy(chk->area + chk->data, str, len + 1);
+	chk->data += len;
+	return 1;
+}
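+
+/* Editor's note: a short usage sketch of the helpers above (hypothetical
+ * function, not part of this header). A trash chunk is a fixed-size scratch
+ * buffer, so only the allocation itself needs checking here; the chunk_*
+ * calls below simply return 0 if the buffer were ever too small.
+ */
+static inline void chunk_usage_example(void)
+{
+	struct buffer *tmp = alloc_trash_chunk();
+
+	if (!tmp)
+		return;			/* allocation may fail */
+
+	chunk_strcpy(tmp, "hello");
+	chunk_appendf(tmp, ", attempt %d", 3);
+	/* tmp->area now holds "hello, attempt 3", tmp->data its length */
+	free_trash_chunk(tmp);
+}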
+
+/* Adds a trailing zero to the current chunk and returns the pointer to the
+ * following part. The purpose is to be able to use a chunk as a series of
+ * short independent strings with chunk_* functions, which do not need to be
+ * released. Returns NULL if no space is available to ensure that the new
+ * string will have its own trailing zero. For example :
+ *   chunk_reset(&trash);
+ *   pid = chunk_newstr(&trash);
+ *   chunk_appendf(&trash, "%d", getpid());
+ *   name = chunk_newstr(&trash);
+ *   chunk_appendf(&trash, "%s", hostname);
+ *   printf("hostname=<%s>, pid=<%s>\n", name, pid);
+ */
+static inline char *chunk_newstr(struct buffer *chk)
+{
+	if (chk->data + 1 >= chk->size)
+		return NULL;
+
+	chk->area[chk->data++] = 0;
+	return chk->area + chk->data;
+}
+
+static inline void chunk_drop(struct buffer *chk)
+{
+	chk->area = NULL;
+	chk->data = -1;
+	chk->size = 0;
+}
+
+static inline void chunk_destroy(struct buffer *chk)
+{
+	if (!chk->size)
+		return;
+
+	free(chk->area);
+	chunk_drop(chk);
+}
+
+/*
+ * frees the destination chunk if already allocated, allocates a new string,
+ * and copies the source into it. The new chunk will have extra room for a
+ * trailing zero unless the source chunk was actually full. The pointer to
+ * the destination string is returned, or NULL if the allocation fails or if
+ * any pointer is NULL.
+ */
+static inline char *chunk_dup(struct buffer *dst, const struct buffer *src)
+{
+	if (!dst || !src || !src->area)
+		return NULL;
+
+	if (dst->size)
+		free(dst->area);
+	dst->head = src->head;
+	dst->data = src->data;
+	dst->size = src->data;
+	if (dst->size < src->size || !src->size)
+		dst->size++;
+
+	dst->area = malloc(dst->size);
+	if (!dst->area) {
+		dst->head = 0;
+		dst->data = 0;
+		dst->size = 0;
+		return NULL;
+	}
+
+	memcpy(dst->area, src->area, dst->data);
+	if (dst->data < dst->size)
+		dst->area[dst->data] = 0;
+
+	return dst->area;
+}
+
+#endif /* _HAPROXY_CHUNK_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/cli-t.h b/include/haproxy/cli-t.h
new file mode 100644
index 0000000..c155df3
--- /dev/null
+++ b/include/haproxy/cli-t.h
@@ -0,0 +1,100 @@
+/*
+ * include/haproxy/cli-t.h
+ * This file provides structures and types for CLI.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_CLI_T_H +#define _HAPROXY_CLI_T_H + +#include <haproxy/applet-t.h> + +/* Access level for a stats socket (appctx->cli_level) */ +#define ACCESS_LVL_NONE 0x0000 +#define ACCESS_LVL_USER 0x0001 +#define ACCESS_LVL_OPER 0x0002 +#define ACCESS_LVL_ADMIN 0x0003 +#define ACCESS_LVL_MASK 0x0003 + +#define ACCESS_FD_LISTENERS 0x0004 /* expose listeners FDs on stats socket */ +#define ACCESS_MASTER 0x0008 /* works with the master (and every other processes) */ +#define ACCESS_MASTER_ONLY 0x0010 /* only works with the master */ +#define ACCESS_EXPERT 0x0020 /* access to dangerous commands reserved to experts */ +#define ACCESS_EXPERIMENTAL 0x0040 +#define ACCESS_MCLI_DEBUG 0x0080 /* allow the master CLI to use any command without the flag ACCESS_MASTER */ +#define ACCESS_MCLI_SEVERITY_NB 0x0100 /* 'set severity-output number' on master CLI */ +#define ACCESS_MCLI_SEVERITY_STR 0x0200 /* 'set severity-output string' on master CLI */ + +/* flags for appctx->st1 */ +#define APPCTX_CLI_ST1_PROMPT (1 << 0) +#define APPCTX_CLI_ST1_PAYLOAD (1 << 1) +#define APPCTX_CLI_ST1_NOLF (1 << 2) +#define APPCTX_CLI_ST1_TIMED (1 << 3) + +#define CLI_PREFIX_KW_NB 5 +#define CLI_MAX_MATCHES 5 +#define CLI_MAX_HELP_ENTRIES 1024 + +/* CLI states */ +enum { + CLI_ST_INIT = 0, /* initial state, must leave to zero ! */ + CLI_ST_END, /* final state, let's close */ + CLI_ST_GETREQ, /* wait for a request */ + CLI_ST_OUTPUT, /* all states after this one are responses */ + CLI_ST_PROMPT, /* display the prompt (first output, same code) */ + CLI_ST_PRINT, /* display const message in cli->msg */ + CLI_ST_PRINT_ERR, /* display const error in cli->msg */ + CLI_ST_PRINT_DYN, /* display dynamic message in cli->err. After the display, free the pointer */ + CLI_ST_PRINT_DYNERR, /* display dynamic error in cli->err. After the display, free the pointer */ + CLI_ST_PRINT_UMSG, /* display usermsgs_ctx buffer. After the display, usermsgs_ctx is reset. */ + CLI_ST_PRINT_UMSGERR, /* display usermsgs_ctx buffer as error. After the display, usermsgs_ctx is reset. */ + CLI_ST_CALLBACK, /* custom callback pointer */ +}; + +/* CLI severity output formats */ +enum { + CLI_SEVERITY_UNDEFINED = 0, /* undefined severity format */ + CLI_SEVERITY_NONE, /* no severity information prepended */ + CLI_SEVERITY_NUMBER, /* prepend informational cli messages with a severity as number */ + CLI_SEVERITY_STRING, /* prepend informational cli messages with a severity as string */ +}; + +/* CLI context for printing command responses. 
*/
+struct cli_print_ctx {
+	const char *msg;	/* pointer to a persistent message to be returned in CLI_ST_PRINT state */
+	char *err;		/* pointer to a 'must free' message to be returned in CLI_ST_PRINT_DYN state */
+	int severity;		/* severity of the message to be returned according to (syslog) rfc5424 */
+};
+
+struct cli_kw {
+	const char *str_kw[CLI_PREFIX_KW_NB];	/* keywords ended by NULL, limited to CLI_PREFIX_KW_NB
+						 * separated keywords combination */
+	const char *usage;			/* usage message */
+	int (*parse)(char **args, char *payload, struct appctx *appctx, void *private);
+	int (*io_handler)(struct appctx *appctx);
+	void (*io_release)(struct appctx *appctx);
+	void *private;
+	int level;	/* this is the level needed to show the keyword usage and to use it */
+};
+
+struct cli_kw_list {
+	struct list list;
+	struct cli_kw kw[VAR_ARRAY];
+};
+
+#endif /* _HAPROXY_CLI_T_H */
diff --git a/include/haproxy/cli.h b/include/haproxy/cli.h
new file mode 100644
index 0000000..32c6599
--- /dev/null
+++ b/include/haproxy/cli.h
@@ -0,0 +1,138 @@
+/*
+ * include/haproxy/cli.h
+ * This file contains definitions of some primitives dedicated to
+ * statistics output.
+ *
+ * Copyright (C) 2000-2011 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_CLI_H
+#define _HAPROXY_CLI_H
+
+#include <haproxy/applet.h>
+#include <haproxy/channel-t.h>
+#include <haproxy/cli-t.h>
+#include <haproxy/global.h>
+#include <haproxy/mworker-t.h>
+#include <haproxy/stream-t.h>
+
+
+void cli_register_kw(struct cli_kw_list *kw_list);
+struct cli_kw* cli_find_kw_exact(char **args);
+void cli_list_keywords(void);
+
+int cli_has_level(struct appctx *appctx, int level);
+
+int cli_parse_default(char **args, char *payload, struct appctx *appctx, void *private);
+
+/* mworker proxy functions */
+
+int mworker_cli_proxy_create(void);
+struct bind_conf *mworker_cli_proxy_new_listener(char *line);
+int mworker_cli_sockpair_new(struct mworker_proc *mworker_proc, int proc);
+void mworker_cli_proxy_stop(void);
+
+extern struct bind_conf *mcli_reload_bind_conf;
+
+/* proxy mode cli functions */
+
+/* analyzers */
+int pcli_wait_for_request(struct stream *s, struct channel *req, int an_bit);
+int pcli_wait_for_response(struct stream *s, struct channel *rep, int an_bit);
+
+/* updates the CLI's context to log <msg> at <severity> and returns 1. This is
+ * for use in CLI parsers to deal with quick response messages.
+ */
+static inline int cli_msg(struct appctx *appctx, int severity, const char *msg)
+{
+	struct cli_print_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+
+	ctx->severity = severity;
+	ctx->msg = msg;
+	appctx->st0 = CLI_ST_PRINT;
+	return 1;
+}
+
+/* updates the CLI's context to log error message <err> and returns 1. The
+ * message will be logged at level LOG_ERR. This is for use in CLI parsers to
+ * deal with quick response messages.
+ */
+static inline int cli_err(struct appctx *appctx, const char *err)
+{
+	struct cli_print_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+
+	ctx->msg = err;
+	appctx->st0 = CLI_ST_PRINT_ERR;
+	return 1;
+}
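+
+/* Editor's note: a hypothetical keyword parser sketching how the two helpers
+ * above are meant to be used from a struct cli_kw 'parse' callback. The
+ * messages and the LOG_INFO severity (assumed visible from <syslog.h>) are
+ * illustrative only, not part of this header.
+ */
+static inline int cli_parse_example(char **args, char *payload,
+				    struct appctx *appctx, void *private)
+{
+	if (!*args[1])
+		return cli_err(appctx, "Missing argument.\n");
+
+	return cli_msg(appctx, LOG_INFO, "Done.\n");
+}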
+
+/* updates the CLI's context to log <msg> at <severity> and returns 1. The
+ * message must have been dynamically allocated and will be freed. This is
+ * for use in CLI parsers to deal with quick response messages.
+ */
+static inline int cli_dynmsg(struct appctx *appctx, int severity, char *msg)
+{
+	struct cli_print_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+
+	ctx->severity = severity;
+	ctx->err = msg;
+	appctx->st0 = CLI_ST_PRINT_DYN;
+	return 1;
+}
+
+/* updates the CLI's context to log error message <err> and returns 1. The
+ * message must have been dynamically allocated and will be freed. The message
+ * will be logged at level LOG_ERR. This is for use in CLI parsers to deal with
+ * quick response messages.
+ */
+static inline int cli_dynerr(struct appctx *appctx, char *err)
+{
+	struct cli_print_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+
+	ctx->err = err;
+	appctx->st0 = CLI_ST_PRINT_DYNERR;
+	return 1;
+}
+
+/* updates the CLI's context to log messages stored in thread-local
+ * usermsgs_ctx at <severity> level. usermsgs_ctx will be reset when done.
+ * This is for use in CLI parsers to deal with quick response messages.
+ *
+ * Always returns 1.
+ */
+static inline int cli_umsg(struct appctx *appctx, int severity)
+{
+	struct cli_print_ctx *ctx = applet_reserve_svcctx(appctx, sizeof(*ctx));
+
+	ctx->severity = severity;
+	appctx->st0 = CLI_ST_PRINT_UMSG;
+	return 1;
+}
+
+/* updates the CLI's context to log messages stored in thread-local
+ * usermsgs_ctx using error level. usermsgs_ctx will be reset when done.
+ * This is for use in CLI parsers to deal with quick response messages.
+ *
+ * Always returns 1.
+ */
+static inline int cli_umsgerr(struct appctx *appctx)
+{
+	appctx->st0 = CLI_ST_PRINT_UMSGERR;
+	return 1;
+}
+
+#endif /* _HAPROXY_CLI_H */
diff --git a/include/haproxy/clock.h b/include/haproxy/clock.h
new file mode 100644
index 0000000..264363e
--- /dev/null
+++ b/include/haproxy/clock.h
@@ -0,0 +1,59 @@
+/*
+ * include/haproxy/clock.h
+ * Exported parts for time-keeping
+ *
+ * Copyright (C) 2000-2021 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_CLOCK_H +#define _HAPROXY_CLOCK_H + +#include <sys/time.h> +#include <haproxy/api.h> + +extern struct timeval start_date; /* the process's start date in wall-clock time */ +extern struct timeval ready_date; /* date when the process was considered ready */ +extern ullong start_time_ns; /* the process's start date in internal monotonic time (ns) */ +extern volatile ullong global_now_ns; /* common monotonic date between all threads, in ns (wraps every 585 yr) */ + +extern THREAD_LOCAL ullong now_ns; /* internal monotonic date derived from real clock, in ns (wraps every 585 yr) */ +extern THREAD_LOCAL struct timeval date; /* the real current date (wall-clock time) */ + +uint64_t now_cpu_time_thread(int thr); +uint64_t now_mono_time(void); +uint64_t now_mono_time_fast(void); +uint64_t now_cpu_time(void); +uint64_t now_cpu_time_fast(void); +void clock_set_local_source(void); +void clock_update_local_date(int max_wait, int interrupted); +void clock_update_global_date(); +void clock_init_process_date(void); +void clock_init_thread_date(void); +int clock_setup_signal_timer(void *timer, int sig, int val); +char *timeofday_as_iso_us(int pad); +uint clock_report_idle(void); +void clock_leaving_poll(int timeout, int interrupted); +void clock_entering_poll(void); +void clock_adjust_now_offset(void); + +static inline void clock_update_date(int max_wait, int interrupted) +{ + clock_update_local_date(max_wait, interrupted); + clock_update_global_date(); +} + +#endif diff --git a/include/haproxy/compat.h b/include/haproxy/compat.h new file mode 100644 index 0000000..aa4f952 --- /dev/null +++ b/include/haproxy/compat.h @@ -0,0 +1,313 @@ +/* + * include/haproxy/compat.h + * Operating system compatibility interface. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_COMPAT_H +#define _HAPROXY_COMPAT_H + +#include <limits.h> +#include <unistd.h> +/* This is needed on Linux for Netfilter includes */ +#include <sys/param.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <arpa/inet.h> +#include <netinet/in.h> +#include <netinet/tcp.h> + + +/* These are a few short names for commonly used types whose size and sometimes + * signedness depends on the architecture. Be careful not to rely on a few + * common but wrong assumptions: + * - char is not always signed (ARM, AARCH64, PPC) + * - long is not always large enough for a pointer (Windows) + * These types are needed with the standard C API (string.h, printf, syscalls). 
+ * + * When a fixed size is needed (protocol interoperability), better use the + * standard types provided by stdint.h: + * - size_t : unsigned int of default word size, large enough for any + * object in memory + * - ssize_t : signed int of default word size, used by some syscalls + * - uintptr_t : an unsigned int large enough to store any pointer + * - ptrdiff_t : a signed int large enough to hold a distance between 2 ptrs + * - int<size>_t : a signed int of <size> bits (8,16,32,64 work everywhere) + * - uint<size>_t : an unsigned int of <size> bits + */ +typedef signed char schar; +typedef unsigned char uchar; +typedef unsigned short ushort; +typedef unsigned int uint; +typedef unsigned long ulong; +typedef unsigned long long ullong; +typedef long long llong; + + +/* set any optional field in a struct to this type to save ifdefs. Its address + * will still be valid but it will not reserve any room nor require any + * initialization. + */ +typedef struct { } empty_t; + +// Redefine some limits that are not present everywhere +#ifndef LLONG_MAX +# define LLONG_MAX 9223372036854775807LL +# define LLONG_MIN (-LLONG_MAX - 1LL) +#endif + +#ifndef ULLONG_MAX +# define ULLONG_MAX (LLONG_MAX * 2ULL + 1) +#endif + +#ifndef LONGBITS +#define LONGBITS ((unsigned int)sizeof(long) * 8) +#endif + +#ifndef BITS_PER_INT +#define BITS_PER_INT (8*sizeof(int)) +#endif + +#ifndef __WORDSIZE +# if defined(__SIZEOF_LONG__) && __SIZEOF_LONG__ == 4 +# define __WORDSIZE 32 +# elif defined(__SIZEOF_LONG__) && __SIZEOF_LONG__ == 8 +# define __WORDSIZE 64 +# else +# error "Unknown machine word size (__WORDSIZE, __SIZEOF_LONG)" +# endif +#endif + +#ifndef MIN +#define MIN(a, b) (((a) < (b)) ? (a) : (b)) +#endif + +#ifndef MAX +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) +#endif + +/* this is for libc5 for example */ +#ifndef TCP_NODELAY +#define TCP_NODELAY 1 +#endif + +#ifndef SHUT_RD +#define SHUT_RD 0 +#endif + +#ifndef SHUT_WR +#define SHUT_WR 1 +#endif + +/* only Linux defines it */ +#ifndef MSG_NOSIGNAL +#define MSG_NOSIGNAL 0 +#endif + +/* AIX does not define MSG_DONTWAIT. We'll define it to zero, and test it + * wherever appropriate. + */ +#ifndef MSG_DONTWAIT +#define MSG_DONTWAIT 0 +#endif + +/* Only Linux defines MSG_MORE */ +#ifndef MSG_MORE +#define MSG_MORE 0 +#endif + +/* On Linux 2.4 and above, MSG_TRUNC can be used on TCP sockets to drop any + * pending data. Let's rely on NETFILTER to detect if this is supported. 
+ */
+#ifdef USE_NETFILTER
+#define MSG_TRUNC_CLEARS_INPUT
+#endif
+
+/* Maximum path length, OS-dependent */
+#ifndef MAXPATHLEN
+#define MAXPATHLEN 128
+#endif
+
+/* longest UNIX socket name */
+#ifndef UNIX_MAX_PATH
+#define UNIX_MAX_PATH 108
+#endif
+
+/* On Linux, allows pipes to be resized */
+#ifndef F_SETPIPE_SZ
+#define F_SETPIPE_SZ (1024 + 7)
+#endif
+
+#if defined(USE_TPROXY) && defined(USE_NETFILTER)
+#include <linux/types.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv4.h>
+#endif
+
+/* On Linux, IP_TRANSPARENT and/or IP_FREEBIND generally require a kernel patch */
+#if defined(USE_LINUX_TPROXY)
+#if !defined(IP_FREEBIND)
+#define IP_FREEBIND 15
+#endif /* !IP_FREEBIND */
+#if !defined(IP_TRANSPARENT)
+#define IP_TRANSPARENT 19
+#endif /* !IP_TRANSPARENT */
+#if !defined(IPV6_TRANSPARENT)
+#define IPV6_TRANSPARENT 75
+#endif /* !IPV6_TRANSPARENT */
+#endif /* USE_LINUX_TPROXY */
+
+#if defined(IP_FREEBIND) \
+ || defined(IP_BINDANY) \
+ || defined(IPV6_BINDANY) \
+ || defined(SO_BINDANY) \
+ || defined(IP_TRANSPARENT) \
+ || defined(IPV6_TRANSPARENT)
+#define CONFIG_HAP_TRANSPARENT
+#endif
+
+/* We'll try to enable SO_REUSEPORT on Linux 2.4 and 2.6 if not defined.
+ * There are two families of values depending on the architecture. Those
+ * are at least valid on Linux 2.4 and 2.6, reason why we'll rely on the
+ * USE_NETFILTER define.
+ */
+#if !defined(SO_REUSEPORT) && defined(USE_NETFILTER)
+#if defined(SO_REUSEADDR) && (SO_REUSEADDR == 2)
+#define SO_REUSEPORT 15
+#elif defined(SO_REUSEADDR) && (SO_REUSEADDR == 0x0004)
+#define SO_REUSEPORT 0x0200
+#endif /* SO_REUSEADDR */
+#endif /* SO_REUSEPORT */
+
+/* only Linux defines TCP_FASTOPEN */
+#ifdef USE_TFO
+#ifndef TCP_FASTOPEN
+#define TCP_FASTOPEN 23
+#endif
+
+#ifndef TCP_FASTOPEN_CONNECT
+#define TCP_FASTOPEN_CONNECT 30
+#endif
+#endif
+
+/* If IPv6 is supported, define IN6_IS_ADDR_V4MAPPED() if missing. */
+#if defined(IPV6_TCLASS) && !defined(IN6_IS_ADDR_V4MAPPED)
+#define IN6_IS_ADDR_V4MAPPED(a) \
+((((const uint32_t *) (a))[0] == 0) \
+&& (((const uint32_t *) (a))[1] == 0) \
+&& (((const uint32_t *) (a))[2] == htonl (0xffff)))
+#endif
+
+#if defined(__dietlibc__)
+#include <strings.h>
+#endif
+
+/* crypt_r() has been present in glibc since 2.2 and on FreeBSD since 12.0
+ * (1200002). No other OS makes any mention of it for now. Feel free to add
+ * valid known combinations below if needed to relax the crypt() lock when
+ * using threads.
+ */
+#if (defined(__GNU_LIBRARY__) && (__GLIBC__ > 2 || __GLIBC__ == 2 && __GLIBC_MINOR__ >= 2)) \
+ || (defined(__FreeBSD__) && __FreeBSD_version >= 1200002)
+#define HA_HAVE_CRYPT_R
+#endif
+
+/* some backtrace() implementations are broken or incomplete, in this case we
+ * can replace them. We must not do it all the time as some are more accurate
+ * than ours.
+ */
+#ifdef USE_BACKTRACE
+#if defined(__aarch64__)
+/* on aarch64 at least from gcc-4.7.4 to 7.4.1 we only get a single entry, which
+ * is pointless. Ours works though it misses the faulty function itself,
+ * probably due to an alternate stack for the signal handler which does not
+ * create a new frame hence doesn't store the caller's return address.
+ */
+#elif defined(__clang__) && defined(__x86_64__)
+/* this is on FreeBSD, where clang 4.0 to 8.0 produce backtraces that don't go
+ * further than the sighandler.
+ */
+#else
+#define HA_HAVE_WORKING_BACKTRACE
+#endif
+#endif
+
+/* dl_iterate_phdr() is available in GLIBC 2.2.4 and up. Let's round up to 2.3.x */
+#if defined(USE_DL) && defined(__GNU_LIBRARY__) && (__GLIBC__ > 2 || __GLIBC__ == 2 && __GLIBC_MINOR__ >= 3)
+#define HA_HAVE_DL_ITERATE_PHDR
+#define HA_HAVE_DUMP_LIBS
+#endif
+
+/* malloc_trim() can be very convenient to reclaim unused memory especially
+ * from huge pattern files. It's available (and really usable) in glibc 2.8 and
+ * above.
+ */
+#if (defined(__GNU_LIBRARY__) && (__GLIBC__ > 2 || __GLIBC__ == 2 && __GLIBC_MINOR__ >= 8))
+#include <malloc.h>
+#define HA_HAVE_MALLOC_TRIM
+#endif
+
+/* glibc 2.26 includes a thread-local cache which makes it fast enough in threads */
+#if (defined(__GNU_LIBRARY__) && (__GLIBC__ > 2 || __GLIBC__ == 2 && __GLIBC_MINOR__ >= 26))
+#include <malloc.h>
+#define HA_HAVE_FAST_MALLOC
+#endif
+
+/* glibc 2.33 provides mallinfo2() that overcomes mallinfo()'s type limitations */
+#if (defined(__GNU_LIBRARY__) && (__GLIBC__ > 2 || __GLIBC__ == 2 && __GLIBC_MINOR__ >= 33))
+#include <malloc.h>
+#define HA_HAVE_MALLINFO2
+#endif
+
+/* FreeBSD also has malloc_usable_size() but it requires malloc_np.h */
+#if defined(USE_MEMORY_PROFILING) && defined(__FreeBSD__) && (__FreeBSD_version >= 700002)
+#include <malloc_np.h>
+#endif
+
+/* macOS has a call similar to malloc_usable_size */
+#if defined(__APPLE__)
+#include <malloc/malloc.h>
+#define malloc_usable_size malloc_size
+#define HA_HAVE_MALLOC_ZONE
+#define TCP_KEEPIDLE TCP_KEEPALIVE
+#define TCP_INFO TCP_CONNECTION_INFO
+#define tcp_info tcp_connection_info
+#endif
+
+/* Max number of file descriptors we send in one sendmsg(). Linux seems to be
+ * able to send 253 fds per sendmsg(), however musl is limited to 252, not sure
+ * about the other OSes.
+ */
+#define MAX_SEND_FD 252
+
+/* Some bsd kernels (ie: FreeBSD) offer the FAST clock source as equivalent
+ * to Linux COARSE clock source. Aliasing COARSE to FAST on such systems when
+ * COARSE is not already defined.
+ */
+#if !defined(CLOCK_MONOTONIC_COARSE) && defined(CLOCK_MONOTONIC_FAST)
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC_FAST
+#endif
+
+#endif /* _HAPROXY_COMPAT_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/compiler.h b/include/haproxy/compiler.h
new file mode 100644
index 0000000..d8e8a72
--- /dev/null
+++ b/include/haproxy/compiler.h
@@ -0,0 +1,469 @@
+/*
+ * include/haproxy/compiler.h
+ * This file contains some compiler-specific settings.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_COMPILER_H
+#define _HAPROXY_COMPILER_H
+
+/* leave a chance to the compiler to bring its own definitions first; this
+ * will cause cdefs.h to be included on systems which have it.
+ */
+#include <inttypes.h>
+
+#ifdef DEBUG_USE_ABORT
+#include <stdlib.h>
+#endif
+
+/*
+ * Gcc before 3.0 needs [0] to declare a variable-size array
+ */
+#ifndef VAR_ARRAY
+#if defined(__GNUC__) && (__GNUC__ < 3)
+#define VAR_ARRAY	0
+#else
+#define VAR_ARRAY
+#endif
+#endif
+
+/* This is used to test if a macro is defined and equals 1. The principle is
+ * that the macro is passed as a value and its value concatenated to the word
+ * "comma_for_one" to form a new macro name. The macro "comma_for_one1" equals
+ * one comma, which, once used in an argument, will shift all of them by one,
+ * so that we can use this to concatenate both a 1 and a 0 and always pick the
+ * second one.
+ */
+#define comma_for_one1 ,
+#define _____equals_1(x, y, ...) (y)
+#define ____equals_1(x, ...) _____equals_1(x, 0)
+#define ___equals_1(x) ____equals_1(comma_for_one ## x 1)
+#define __equals_1(x) ___equals_1(x)
+
+/* gcc 5 and clang 3 brought __has_attribute(), which is not well documented in
+ * the case of gcc, but is convenient since handled at the preprocessor level.
+ * In both cases it's possible to test for __has_attribute() using ifdef. When
+ * not defined we remap this to the __has_attribute_<name> macro so that we'll
+ * later be able to implement on a per-compiler basis those which are missing,
+ * by defining __has_attribute_<name> to 1.
+ */
+#ifndef __has_attribute
+#define __has_attribute(x) __equals_1(__has_attribute_ ## x)
+#endif
+
+/* The fallthrough attribute arrived with gcc 7, the same version that started
+ * to emit the fallthrough warnings and to parse the comments. Comments do not
+ * manage to stop the warning when preprocessing is split from compiling (e.g.
+ * when building under distcc). Better encourage the use of a __fallthrough
+ * statement instead. There are still limitations in that clang doesn't accept
+ * it after a label; this is the reason why we're always preceding it with an
+ * empty do-while.
+ */
+#if __has_attribute(fallthrough)
+# define __fallthrough do { } while (0); __attribute__((fallthrough))
+#else
+# define __fallthrough do { } while (0)
+#endif
+
+#if !defined(__GNUC__)
+/* Some versions of glibc irresponsibly redefine __attribute__() to empty for
+ * non-gcc compilers, and as such, silently break all constructors with other
+ * compilers. Let's make sure such incompatibilities are detected if any,
+ * or that the attribute is properly enforced.
+ */
+#undef __attribute__
+#define __attribute__(x) __attribute__(x)
+#endif
+
+/* attribute(warning) was added in gcc 4.3 */
+#if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3))
+# define __has_attribute_warning 1
+#endif
+
+/* __attribute__warning(x) does __attribute__((warning(x))) if supported by the
+ * compiler, otherwise __attribute__((deprecated)). Clang supports it since v14
+ * but is a bit capricious in that it refuses a redefinition with a warning
+ * attribute that wasn't there the first time. However it's OK with deprecated(x)
+ * so better use this one. See: https://github.com/llvm/llvm-project/issues/56519
+ */
+#if defined(__clang__)
+# define __attribute__warning(x) __attribute__((deprecated(x)))
+#elif __has_attribute(warning)
+# define __attribute__warning(x) __attribute__((warning(x)))
+#else
+# define __attribute__warning(x) __attribute__((deprecated))
+#endif
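+
+/* Editor's note: a hypothetical illustration of __fallthrough above, not part
+ * of this header. With gcc >= 7 the statement silences -Wimplicit-fallthrough
+ * in places where a comment would be lost by split preprocessing (distcc).
+ */
+static inline int compiler_fallthrough_example(int c)
+{
+	switch (c) {
+	case '\r':
+		__fallthrough;	/* CR is handled exactly like LF below */
+	case '\n':
+		return 1;
+	default:
+		return 0;
+	}
+}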
+
+/* By default, gcc does not inline large chunks of code, but we want it to
+ * respect our choices.
+ */
+#if !defined(forceinline)
+#if !defined(__GNUC__) || (__GNUC__ < 3)
+#define forceinline inline
+#else
+#define forceinline inline __attribute__((always_inline))
+#endif
+#endif
+
+#ifndef __maybe_unused
+/* silence the "unused" warnings without having to place painful #ifdefs.
+ * For use with variables or functions.
+ */
+#define __maybe_unused __attribute__((unused))
+#endif
+
+/* TCC doesn't support weak attribute, sections etc and needs the more portable
+ * obsolete linker model instead.
+ */
+#if defined(__TINYC__) && !defined(USE_OBSOLETE_LINKER)
+#define USE_OBSOLETE_LINKER 1
+#endif
+
+/* These macros are used to declare a section name for a variable.
+ * WARNING: keep section names short, as MacOS limits them to 16 characters.
+ * The _START and _STOP attributes have to be placed after the start and stop
+ * weak symbol declarations, and are only used by MacOS.
+ */
+#if !defined(USE_OBSOLETE_LINKER)
+
+#ifdef __APPLE__
+#define HA_SECTION(s) __attribute__((__section__("__DATA, " s)))
+#define HA_SECTION_START(s) __asm("section$start$__DATA$" s)
+#define HA_SECTION_STOP(s) __asm("section$end$__DATA$" s)
+#else
+#define HA_SECTION(s) __attribute__((__section__(s)))
+#define HA_SECTION_START(s)
+#define HA_SECTION_STOP(s)
+#endif
+
+#else // obsolete linker below, let's just not force any section
+
+#define HA_SECTION(s)
+#define HA_SECTION_START(s)
+#define HA_SECTION_STOP(s)
+
+#endif // USE_OBSOLETE_LINKER
+
+/* Declare a symbol as weak if possible, otherwise global. Since we don't want to
+ * error on multiple definitions, the symbol is declared weak. On MacOS ".weak"
+ * does not exist and we must continue to use ".globl" instead. Note that
+ * ".global" is to be avoided on other platforms as llvm complains about it
+ * being used for symbols declared as weak elsewhere in the code. It may or may
+ * not work depending on linkers and assemblers, this is only for advanced use
+ * anyway (and most likely it will only work with !USE_OBSOLETE_LINKER).
+ */
+#if defined(__APPLE__)
+# define __HA_WEAK(sym) __asm__(".globl " #sym)
+#else
+# define __HA_WEAK(sym) __asm__(".weak " #sym)
+#endif
+#define HA_WEAK(sym) __HA_WEAK(sym)
+
+/* declare a symbol as global */
+#define __HA_GLOBL(sym) __asm__(".globl " #sym)
+#define HA_GLOBL(sym) __HA_GLOBL(sym)
+
+/* use this attribute on a variable to move it to the read_mostly section */
+#if !defined(__read_mostly)
+#define __read_mostly HA_SECTION("read_mostly")
+#endif
+
+/* This allows gcc to know that some locations are never reached, for example
+ * after a longjmp() in the Lua code, hence that some errors caught by such
+ * methods cannot propagate further. This is important with gcc versions 6 and
+ * above which can more aggressively detect null dereferences. The builtin
+ * below was introduced in gcc 4.5, and before it we didn't care.
+ */
+#ifdef DEBUG_USE_ABORT
+#define my_unreachable() abort()
+#else
+#if defined(__GNUC__) && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))
+#define my_unreachable() __builtin_unreachable()
+#else
+#define my_unreachable() do { } while (1)
+#endif
+#endif
+
+/* This prevents the compiler from folding multiple identical code paths into a
+ * single one, by adding a dependency on the line number in the path. This may
+ * typically happen on function tails, or purposely placed abort() before an
+ * unreachable() statement, due to the compiler performing an Identical Code
+ * Folding optimization. This macro is aimed at helping with code tracing in
+ * crash dumps and may also be used for specific optimizations. One known case
+ * is gcc-4.7 and 4.8 which aggressively fold multiple ABORT_NOW() exit points
+ * and which causes wrong line numbers to be reported by the debugger (note
+ * that even newer compilers do this when using abort()). Please keep in mind
+ * that nothing prevents the compiler from folding the code after that point,
+ * but at least it will not fold the code before.
+ */
+#define DO_NOT_FOLD() do { asm volatile("" :: "i"(__LINE__)); } while (0)
+
+/* This macro may be used to block constant propagation that lets the compiler
+ * detect a possible NULL dereference on a variable resulting from an explicit
+ * assignment in an impossible check. Sometimes a function is called which does
+ * safety checks and returns NULL if safe conditions are not met. The place
+ * where it's called cannot hit this condition and dereferencing the pointer
+ * without first checking it will make the compiler emit a warning about a
+ * "potential null pointer dereference" which is hard to work around. This
+ * macro "washes" the pointer and prevents the compiler from emitting tests
+ * branching to undefined instructions. It may only be used when the developer
+ * is absolutely certain that the conditions are guaranteed and that the
+ * pointer passed in argument cannot be NULL by design.
+ */
+#define ALREADY_CHECKED(p) do { asm("" : "=rm"(p) : "0"(p)); } while (0)
+
+/* same as above but to be used to pass the input value to the output but
+ * without letting the compiler know about its initial properties.
+ */
+#define DISGUISE(v) ({ typeof(v) __v = (v); ALREADY_CHECKED(__v); __v; })
+
+/* Implements a static event counter where it's used. This is typically made to
+ * report some warnings only once, either during boot or at runtime. It only
+ * returns true on the very first call, and zero later. It's thread-safe and
+ * uses a single byte of memory per call place. It relies on the atomic xchg
+ * defined in atomic.h which is also part of the common API.
+ */
+#define ONLY_ONCE() ({ static char __cnt; !_HA_ATOMIC_XCHG(&__cnt, 1); })
+
+/* makes a string from a constant (number or macro), avoids the need for
+ * printf("%d") format just to dump a setting limit or value in an error
+ * message. We use two levels so that macros are resolved.
+ */
+#define _TOSTR(x) #x
+#define TOSTR(x) _TOSTR(x)
+
+/*
+ * Gcc >= 3 provides the ability for the program to give hints to the
+ * compiler about what branch of an if is most likely to be taken. This
+ * helps the compiler produce the most compact critical paths, which is
+ * generally better for the cache and to reduce the number of jumps.
+ */
+#if !defined(likely)
+#if !defined(__GNUC__) || (__GNUC__ < 3)
+#define __builtin_expect(x,y) (x)
+#define likely(x) (x)
+#define unlikely(x) (x)
+#else
+#define likely(x) (__builtin_expect((x) != 0, 1))
+#define unlikely(x) (__builtin_expect((x) != 0, 0))
+#endif
+#endif
+
+#ifndef __GNUC_PREREQ__
+#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
+#define __GNUC_PREREQ__(ma, mi) \
+	(__GNUC__ > (ma) || __GNUC__ == (ma) && __GNUC_MINOR__ >= (mi))
+#else
+#define __GNUC_PREREQ__(ma, mi) 0
+#endif
+#endif
+
+#ifndef offsetof
+#if __GNUC_PREREQ__(4, 1)
+#define offsetof(type, field) __builtin_offsetof(type, field)
+#else
+#define offsetof(type, field) \
+	((size_t)(uintptr_t)((const volatile void *)&((type *)0)->field))
+#endif
+#endif
+
+/* Linux-like "container_of". It returns a pointer to the structure of type
+ * <type> which has its member <name> stored at address <ptr>.
+ */
+#ifndef container_of
+#define container_of(ptr, type, name) ((type *)(((void *)(ptr)) - ((long)&((type *)0)->name)))
+#endif
+
+/* returns a pointer to the structure of type <type> which has its member <name>
+ * stored at address <ptr>, unless <ptr> is 0, in which case 0 is returned.
+ */
+#ifndef container_of_safe
+#define container_of_safe(ptr, type, name) \
+	({ void *__p = (ptr); \
+	   __p ? (type *)(__p - ((long)&((type *)0)->name)) : (type *)0; \
+	})
+#endif
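+
+/* Editor's note: a hypothetical illustration of container_of() above, not
+ * part of this header: given a pointer to a member, recover the enclosing
+ * structure. Both types are invented for the example.
+ */
+struct coex_inner {
+	int x;
+};
+
+struct coex_outer {
+	int id;
+	struct coex_inner inner;	/* embedded member */
+};
+
+static inline struct coex_outer *coex_from_inner(struct coex_inner *ptr)
+{
+	return container_of(ptr, struct coex_outer, inner);
+}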
+
+/* From gcc 6 and above, enum values may have attributes */
+#if __GNUC_PREREQ__(6, 0)
+#define ENUM_ATTRIBUTE(x) __attribute__(x)
+#else
+#define ENUM_ATTRIBUTE(x)
+#endif
+
+/* Some architectures have a double-word CAS, sometimes even dual-8 bytes.
+ * Some architectures support unaligned accesses, others are fine with them
+ * but only for non-atomic operations. Also mention those supporting unaligned
+ * accesses and being little endian, and those where unaligned accesses are
+ * known to be fast (almost as fast as aligned ones).
+ */
+#if defined(__x86_64__)
+#define HA_UNALIGNED
+#define HA_UNALIGNED_LE
+#define HA_UNALIGNED_LE64
+#define HA_UNALIGNED_FAST
+#define HA_UNALIGNED_ATOMIC
+#define HA_HAVE_CAS_DW
+#define HA_CAS_IS_8B
+#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
+#define HA_UNALIGNED
+#define HA_UNALIGNED_LE
+#define HA_UNALIGNED_ATOMIC
+#elif defined (__aarch64__) || defined(__ARM_ARCH_8A)
+#define HA_UNALIGNED
+#define HA_UNALIGNED_LE
+#define HA_UNALIGNED_LE64
+#define HA_UNALIGNED_FAST
+#define HA_HAVE_CAS_DW
+#define HA_CAS_IS_8B
+#elif defined(__arm__) && (defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__))
+#define HA_UNALIGNED
+#define HA_UNALIGNED_LE
+#define HA_UNALIGNED_FAST
+#define HA_HAVE_CAS_DW
+#endif
+
+/*********************** IMPORTANT NOTE ABOUT ALIGNMENT **********************\
+ * Alignment works fine for variables. It also works on types and struct     *
+ * members by propagating the alignment to the container struct itself,      *
+ * but this requires that variables of the affected type are properly        *
+ * aligned themselves. While regular variables will always abide, those      *
+ * allocated using malloc() will not! Most platforms provide posix_memalign()*
+ * for this, but it's not available everywhere. As such one ought not to use *
+ * these alignment declarations inside structures that are dynamically       *
+ * allocated. If the purpose is only to avoid false sharing of cache lines   *
+ * for multi_threading, see THREAD_PAD() below.                              *
+\*****************************************************************************/
+
+/* sets alignment for current field or variable */
+#ifndef ALIGNED
+#define ALIGNED(x) __attribute__((aligned(x)))
+#endif
+
+/* sets alignment only on architectures preventing unaligned accesses */
+#ifndef MAYBE_ALIGNED
+#ifndef HA_UNALIGNED
+#define MAYBE_ALIGNED(x) ALIGNED(x)
+#else
+#define MAYBE_ALIGNED(x)
+#endif
+#endif
+
+/* sets alignment only on architectures preventing unaligned atomic accesses */
+#ifndef ATOMIC_ALIGNED
+#ifndef HA_UNALIGNED_ATOMIC
+#define ATOMIC_ALIGNED(x) ALIGNED(x)
+#else
+#define ATOMIC_ALIGNED(x)
+#endif
+#endif
+
+/* sets alignment for current field or variable only when threads are enabled.
+ * Typically used to respect cache line alignment to avoid false sharing.
+ */
+#ifndef THREAD_ALIGNED
+#ifdef USE_THREAD
+#define THREAD_ALIGNED(x) __attribute__((aligned(x)))
+#else
+#define THREAD_ALIGNED(x)
+#endif
+#endif
+
+/* add a mandatory alignment for next fields in a structure */
+#ifndef ALWAYS_ALIGN
+#define ALWAYS_ALIGN(x) union { } ALIGNED(x)
+#endif
+
+/* add an optional alignment for next fields in a structure, only for archs
+ * which do not support unaligned accesses.
+ */
+#ifndef MAYBE_ALIGN
+#ifndef HA_UNALIGNED
+#define MAYBE_ALIGN(x) union { } ALIGNED(x)
+#else
+#define MAYBE_ALIGN(x)
+#endif
+#endif
+
+/* add an optional alignment for next fields in a structure, only for archs
+ * which do not support unaligned accesses for atomic operations.
+ */
+#ifndef ATOMIC_ALIGN
+#ifndef HA_UNALIGNED_ATOMIC
+#define ATOMIC_ALIGN(x) union { } ALIGNED(x)
+#else
+#define ATOMIC_ALIGN(x)
+#endif
+#endif
+
+/* add an optional alignment for next fields in a structure, only when threads
+ * are enabled. Typically used to respect cache line alignment to avoid false
+ * sharing.
+ */
+#ifndef THREAD_ALIGN
+#ifdef USE_THREAD
+#define THREAD_ALIGN(x) union { } ALIGNED(x)
+#else
+#define THREAD_ALIGN(x)
+#endif
+#endif
+
+/* add optional padding of the specified size between fields in a structure,
+ * only when threads are enabled. This is used to avoid false sharing of cache
+ * lines for dynamically allocated structures which cannot guarantee alignment.
+ */
+#ifndef THREAD_PAD
+# ifdef USE_THREAD
+#  define __THREAD_PAD(x,l) char __pad_##l[x]
+#  define _THREAD_PAD(x,l) __THREAD_PAD(x, l)
+#  define THREAD_PAD(x) _THREAD_PAD(x, __LINE__)
+# else
+#  define THREAD_PAD(x)
+# endif
+#endif
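+
+/* Editor's note: a hypothetical illustration of THREAD_PAD() above, not part
+ * of this header: two counters written by different threads are kept apart
+ * (a 64-byte cache line is assumed) when threads are enabled, and the padding
+ * vanishes in single-threaded builds.
+ */
+struct pad_example {
+	unsigned long long reads;	/* bumped by reader threads */
+	THREAD_PAD(64);			/* avoid false sharing between the two */
+	unsigned long long writes;	/* bumped by writer threads */
+};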
+
+/* The THREAD_LOCAL type attribute defines thread-local storage and is defined
+ * to __thread when threads are enabled or empty when disabled.
+ */
+#ifdef USE_THREAD
+#define THREAD_LOCAL __thread
+#else
+#define THREAD_LOCAL
+#endif
+
+/* The __decl_thread() statement shows the argument when threads are enabled
+ * or hides it when disabled. The purpose is to condition the presence of some
+ * variables or struct members to the fact that threads are enabled, without
+ * having to enclose them inside a #ifdef USE_THREAD/#endif clause.
+ */
+#ifdef USE_THREAD
+#define __decl_thread(decl) decl
+#else
+#define __decl_thread(decl)
+#endif
+
+/* clang has a __has_feature() macro which reports true/false on a number of
+ * internally supported features. Let's make sure this macro is always defined
+ * and returns zero when not supported.
+ */
+#ifndef __has_feature
+#define __has_feature(x) 0
+#endif
+
+#endif /* _HAPROXY_COMPILER_H */
diff --git a/include/haproxy/compression-t.h b/include/haproxy/compression-t.h
new file mode 100644
index 0000000..b8f118b
--- /dev/null
+++ b/include/haproxy/compression-t.h
@@ -0,0 +1,109 @@
+/*
+ * include/haproxy/compression-t.h
+ * This file defines everything related to compression.
+ *
+ * Copyright 2012 Exceliance, David Du Colombier <dducolombier@exceliance.fr>
+ *                William Lallemand <wlallemand@exceliance.fr>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_COMP_T_H
+#define _HAPROXY_COMP_T_H
+
+#if defined(USE_SLZ)
+#ifdef USE_ZLIB
+#error "Cannot build with both USE_SLZ and USE_ZLIB at the same time."
+#endif
+#include <import/slz.h>
+#elif defined(USE_ZLIB)
+#include <zlib.h>
+#endif
+
+#include <haproxy/buf-t.h>
+
+/* Direction index */
+
+#define COMP_DIR_REQ 0
+#define COMP_DIR_RES 1
+
+/* Compression flags */
+
+#define COMP_FL_OFFLOAD		0x00000001 /* Compression offload */
+#define COMP_FL_DIR_REQ		0x00000002 /* Compress requests */
+#define COMP_FL_DIR_RES		0x00000004 /* Compress responses */
+
+struct comp {
+	struct comp_algo *algos_res; /* Algos available for response */
+	struct comp_algo *algo_req;  /* Algo to use for request */
+	struct comp_type *types_req; /* Types to be compressed for requests */
+	struct comp_type *types_res; /* Types to be compressed for responses */
+	unsigned int flags;
+};
+
+struct comp_ctx {
+#if defined(USE_SLZ)
+	struct slz_stream strm;
+	const void *direct_ptr; /* NULL or pointer to beginning of data */
+	int direct_len;         /* length of direct_ptr if not NULL */
+	struct buffer queued;   /* if not NULL, data already queued */
+#elif defined(USE_ZLIB)
+	z_stream strm;          /* zlib stream */
+	void *zlib_deflate_state;
+	void *zlib_window;
+	void *zlib_prev;
+	void *zlib_pending_buf;
+	void *zlib_head;
+#endif
+	int cur_lvl;
+};
+
+/* Thanks to MSIE/IIS, the "deflate" name is ambiguous, as according to the RFC
+ * it's a zlib-wrapped deflate stream, but MSIE only understands a raw deflate
+ * stream. For this reason some people prefer to emit a raw deflate stream on
+ * "deflate" and we'll need two algos for the same name, they are distinguished
+ * with the config name.
+ */
+struct comp_algo {
+	char *cfg_name;  /* config name */
+	int cfg_name_len;
+
+	char *ua_name;  /* name for the user-agent */
+	int ua_name_len;
+
+	int (*init)(struct comp_ctx **comp_ctx, int level);
+	int (*add_data)(struct comp_ctx *comp_ctx, const char *in_data, int in_len, struct buffer *out);
+	int (*flush)(struct comp_ctx *comp_ctx, struct buffer *out);
+	int (*finish)(struct comp_ctx *comp_ctx, struct buffer *out);
+	int (*end)(struct comp_ctx **comp_ctx);
+	struct comp_algo *next;
+};
+
+struct comp_type {
+	char *name;
+	int name_len;
+	struct comp_type *next;
+};
+
+
+#endif /* _HAPROXY_COMP_T_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
+
diff --git a/include/haproxy/compression.h b/include/haproxy/compression.h
new file mode 100644
index 0000000..851ea23
--- /dev/null
+++ b/include/haproxy/compression.h
@@ -0,0 +1,44 @@
+/*
+ * include/haproxy/compression.h
+ * This file defines function prototypes for compression.
+ *
+ * Copyright 2012 (C) Exceliance, David Du Colombier <dducolombier@exceliance.fr>
+ *                    William Lallemand <wlallemand@exceliance.fr>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_COMP_H +#define _HAPROXY_COMP_H + +#include <haproxy/compression-t.h> + +extern unsigned int compress_min_idle; + +int comp_append_type(struct comp_type **types, const char *type); +int comp_append_algo(struct comp_algo **algos, const char *algo); + +#ifdef USE_ZLIB +extern long zlib_used_memory; +#endif /* USE_ZLIB */ + +#endif /* _HAPROXY_COMP_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/connection-t.h b/include/haproxy/connection-t.h new file mode 100644 index 0000000..2619fd6 --- /dev/null +++ b/include/haproxy/connection-t.h @@ -0,0 +1,722 @@ +/* + * include/haproxy/connection-t.h + * This file describes the connection struct and associated constants. + * + * Copyright (C) 2000-2014 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_CONNECTION_T_H +#define _HAPROXY_CONNECTION_T_H + +#include <stdlib.h> +#include <sys/socket.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip6.h> + +#include <import/ebtree-t.h> +#include <import/ist.h> + +#include <haproxy/api-t.h> +#include <haproxy/buf-t.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/port_range-t.h> +#include <haproxy/protocol-t.h> +#include <haproxy/show_flags-t.h> +#include <haproxy/thread-t.h> + +/* referenced below */ +struct connection; +struct stconn; +struct sedesc; +struct cs_info; +struct buffer; +struct proxy; +struct server; +struct session; +struct pipe; +struct quic_conn; +struct bind_conf; +struct qcs; +struct ssl_sock_ctx; + +/* Note: subscribing to these events is only valid after the caller has really + * attempted to perform the operation, and failed to proceed or complete. + */ +enum sub_event_type { + SUB_RETRY_RECV = 0x00000001, /* Schedule the tasklet when we can attempt to recv again */ + SUB_RETRY_SEND = 0x00000002, /* Schedule the tasklet when we can attempt to send again */ +}; + +/* For each direction, we have a CO_FL_XPRT_<DIR>_ENA flag, which + * indicates if read or write is desired in that direction for the respective + * layers. The current status corresponding to the current layer being used is + * remembered in the CO_FL_XPRT_<DIR>_ENA flag. The need to poll (ie receipt of + * EAGAIN) is remembered at the file descriptor level so that even when the + * activity is stopped and restarted, we still remember whether it was needed + * to poll before attempting the I/O. + * + * The FD state is updated according to CO_FL_XPRT_<DIR>_ENA in + * conn_cond_update_polling(). + */ + +/* flags for use in connection->flags. 
Please also update the conn_show_flags()
+ * function below in case of changes.
+ */
+enum {
+ CO_FL_NONE = 0x00000000, /* Just for initialization purposes */
+
+ /* Do not change these values without updating conn_*_poll_changes() ! */
+ CO_FL_SAFE_LIST = 0x00000001, /* 0 = not in any list, 1 = in safe_list */
+ CO_FL_IDLE_LIST = 0x00000002, /* 2 = in idle_list, 3 = invalid */
+ CO_FL_LIST_MASK = 0x00000003, /* Is the connection in any server-managed list ? */
+
+ CO_FL_REVERSED = 0x00000004, /* connection has been reversed to backend / reversed and accepted on frontend */
+ CO_FL_ACT_REVERSING = 0x00000008, /* connection has been reversed to frontend but not yet accepted */
+
+ /* unused : 0x00000010 */
+ /* unused : 0x00000020 */
+ /* unused : 0x00000040, 0x00000080 */
+
+ /* These flags indicate whether the Control and Transport layers are initialized */
+ CO_FL_CTRL_READY = 0x00000100, /* FD was registered, fd_delete() needed */
+ CO_FL_XPRT_READY = 0x00000200, /* xprt_start() done, xprt can be used */
+
+ CO_FL_WANT_DRAIN = 0x00000400, /* try to drain pending data when closing */
+
+ /* This flag is used by data layers to indicate they had to stop
+ * receiving data because a buffer was full. The connection handler
+ * clears it before first calling the I/O and data callbacks.
+ */
+ CO_FL_WAIT_ROOM = 0x00000800, /* data sink is full */
+
+ /* These flags are used to report whether the from/to addresses are set or not */
+ /* unused: 0x00001000 */
+ /* unused: 0x00002000 */
+
+ CO_FL_EARLY_SSL_HS = 0x00004000, /* We have early data pending, don't start SSL handshake yet */
+ CO_FL_EARLY_DATA = 0x00008000, /* At least some of the data are early data */
+ CO_FL_SOCKS4_SEND = 0x00010000, /* handshaking with upstream SOCKS4 proxy, going to send the handshake */
+ CO_FL_SOCKS4_RECV = 0x00020000, /* handshaking with upstream SOCKS4 proxy, going to check if the handshake succeeded */
+
+ /* flags used to remember what shutdowns have been performed/reported */
+ CO_FL_SOCK_RD_SH = 0x00040000, /* SOCK layer was notified about shutr/read0 */
+ CO_FL_SOCK_WR_SH = 0x00080000, /* SOCK layer asked for shutw */
+
+ /* flags used to report connection errors or other closing conditions */
+ CO_FL_ERROR = 0x00100000, /* a fatal error was reported */
+ CO_FL_NOTIFY_DONE = 0x001C0000, /* any xprt shut/error flags above need to be reported */
+
+ CO_FL_FDLESS = 0x00200000, /* this connection doesn't use any FD (e.g. QUIC) */
+
+ /* flags used to report connection status updates */
+ CO_FL_WAIT_L4_CONN = 0x00400000, /* waiting for L4 to be connected */
+ CO_FL_WAIT_L6_CONN = 0x00800000, /* waiting for L6 to be connected (eg: SSL) */
+ CO_FL_WAIT_L4L6 = 0x00C00000, /* waiting for L4 and/or L6 to be connected */
+
+ /* All the flags below are used for connection handshakes. Any new
+ * handshake should be added after this point, and CO_FL_HANDSHAKE
+ * should be updated. 
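+ *
+ * As a quick illustration, callers normally test the grouped masks defined
+ * just below rather than individual bits. A minimal, hypothetical sketch
+ * (not part of this file):
+ *
+ *	if (conn->flags & CO_FL_WAIT_XPRT) {
+ *		// transport setup or handshake still in progress
+ *	}
+ *	int in_handshake = (conn->flags & CO_FL_HANDSHAKE) != 0;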
+ */ + CO_FL_SEND_PROXY = 0x01000000, /* send a valid PROXY protocol header */ + CO_FL_ACCEPT_PROXY = 0x02000000, /* receive a valid PROXY protocol header */ + CO_FL_ACCEPT_CIP = 0x04000000, /* receive a valid NetScaler Client IP header */ + + /* below we have all handshake flags grouped into one */ + CO_FL_HANDSHAKE = CO_FL_SEND_PROXY | CO_FL_ACCEPT_PROXY | CO_FL_ACCEPT_CIP | CO_FL_SOCKS4_SEND | CO_FL_SOCKS4_RECV, + CO_FL_WAIT_XPRT = CO_FL_WAIT_L4_CONN | CO_FL_HANDSHAKE | CO_FL_WAIT_L6_CONN, + + CO_FL_SSL_WAIT_HS = 0x08000000, /* wait for an SSL handshake to complete */ + + /* This connection may not be shared between clients */ + CO_FL_PRIVATE = 0x10000000, + + /* This flag is used to know that a PROXY protocol header was sent by the client */ + CO_FL_RCVD_PROXY = 0x20000000, + + /* The connection is unused by its owner */ + CO_FL_SESS_IDLE = 0x40000000, + + /* This last flag indicates that the transport layer is used (for instance + * by logs) and must not be cleared yet. The last call to conn_xprt_close() + * must be done after clearing this flag. + */ + CO_FL_XPRT_TRACKED = 0x80000000, + + /* below we have all SOCKS handshake flags grouped into one */ + CO_FL_SOCKS4 = CO_FL_SOCKS4_SEND | CO_FL_SOCKS4_RECV, +}; + +/* This function is used to report flags in debugging tools. Please reflect + * below any single-bit flag addition above in the same order via the + * __APPEND_FLAG macro. The new end of the buffer is returned. + */ +static forceinline char *conn_show_flags(char *buf, size_t len, const char *delim, uint flg) +{ +#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__) + /* prologue */ + _(0); + /* flags */ + _(CO_FL_SAFE_LIST, _(CO_FL_IDLE_LIST, _(CO_FL_CTRL_READY, + _(CO_FL_REVERSED, _(CO_FL_ACT_REVERSING, _(CO_FL_XPRT_READY, + _(CO_FL_WANT_DRAIN, _(CO_FL_WAIT_ROOM, _(CO_FL_EARLY_SSL_HS, _(CO_FL_EARLY_DATA, + _(CO_FL_SOCKS4_SEND, _(CO_FL_SOCKS4_RECV, _(CO_FL_SOCK_RD_SH, _(CO_FL_SOCK_WR_SH, + _(CO_FL_ERROR, _(CO_FL_FDLESS, _(CO_FL_WAIT_L4_CONN, _(CO_FL_WAIT_L6_CONN, + _(CO_FL_SEND_PROXY, _(CO_FL_ACCEPT_PROXY, _(CO_FL_ACCEPT_CIP, _(CO_FL_SSL_WAIT_HS, + _(CO_FL_PRIVATE, _(CO_FL_RCVD_PROXY, _(CO_FL_SESS_IDLE, _(CO_FL_XPRT_TRACKED + )))))))))))))))))))))))))); + /* epilogue */ + _(~0U); + return buf; +#undef _ +} + +/* Possible connection error codes. + * Warning: Do not reorder the codes, they are fetchable through the + * "fc_err" sample fetch. If a new code is added, please add an error label + * in conn_err_code_str and in the "fc_err_str" sample fetch documentation. 
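+ *
+ * As a usage sketch (hypothetical caller), the code is typically read from
+ * conn->err_code and turned into a message with conn_err_code_str(),
+ * declared in haproxy/connection.h:
+ *
+ *	const char *msg; // hypothetical local variable
+ *	if (conn->err_code != CO_ER_NONE)
+ *		msg = conn_err_code_str(conn);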
+ */ +enum { + CO_ER_NONE, /* no error */ + + CO_ER_CONF_FDLIM, /* reached process's configured FD limitation */ + CO_ER_PROC_FDLIM, /* reached process's FD limitation */ + CO_ER_SYS_FDLIM, /* reached system's FD limitation */ + CO_ER_SYS_MEMLIM, /* reached system buffers limitation */ + CO_ER_NOPROTO, /* protocol not supported */ + CO_ER_SOCK_ERR, /* other socket error */ + + CO_ER_PORT_RANGE, /* source port range exhausted */ + CO_ER_CANT_BIND, /* can't bind to source address */ + CO_ER_FREE_PORTS, /* no more free ports on the system */ + CO_ER_ADDR_INUSE, /* local address already in use */ + + CO_ER_PRX_EMPTY, /* nothing received in PROXY protocol header */ + CO_ER_PRX_ABORT, /* client abort during PROXY protocol header */ + CO_ER_PRX_TIMEOUT, /* timeout while waiting for a PROXY header */ + CO_ER_PRX_TRUNCATED, /* truncated PROXY protocol header */ + CO_ER_PRX_NOT_HDR, /* not a PROXY protocol header */ + CO_ER_PRX_BAD_HDR, /* bad PROXY protocol header */ + CO_ER_PRX_BAD_PROTO, /* unsupported protocol in PROXY header */ + + CO_ER_CIP_EMPTY, /* nothing received in NetScaler Client IP header */ + CO_ER_CIP_ABORT, /* client abort during NetScaler Client IP header */ + CO_ER_CIP_TIMEOUT, /* timeout while waiting for a NetScaler Client IP header */ + CO_ER_CIP_TRUNCATED, /* truncated NetScaler Client IP header */ + CO_ER_CIP_BAD_MAGIC, /* bad magic number in NetScaler Client IP header */ + CO_ER_CIP_BAD_PROTO, /* unsupported protocol in NetScaler Client IP header */ + + CO_ER_SSL_EMPTY, /* client closed during SSL handshake */ + CO_ER_SSL_ABORT, /* client abort during SSL handshake */ + CO_ER_SSL_TIMEOUT, /* timeout during SSL handshake */ + CO_ER_SSL_TOO_MANY, /* too many SSL connections */ + CO_ER_SSL_NO_MEM, /* no more memory to allocate an SSL connection */ + CO_ER_SSL_RENEG, /* forbidden client renegotiation */ + CO_ER_SSL_CA_FAIL, /* client cert verification failed in the CA chain */ + CO_ER_SSL_CRT_FAIL, /* client cert verification failed on the certificate */ + CO_ER_SSL_MISMATCH, /* Server presented an SSL certificate different from the configured one */ + CO_ER_SSL_MISMATCH_SNI, /* Server presented an SSL certificate different from the expected one */ + CO_ER_SSL_HANDSHAKE, /* SSL error during handshake */ + CO_ER_SSL_HANDSHAKE_HB, /* SSL error during handshake with heartbeat present */ + CO_ER_SSL_KILLED_HB, /* Stopped a TLSv1 heartbeat attack (CVE-2014-0160) */ + CO_ER_SSL_NO_TARGET, /* unknown target (not client nor server) */ + CO_ER_SSL_EARLY_FAILED, /* Server refused early data */ + + CO_ER_SOCKS4_SEND, /* SOCKS4 Proxy write error during handshake */ + CO_ER_SOCKS4_RECV, /* SOCKS4 Proxy read error during handshake */ + CO_ER_SOCKS4_DENY, /* SOCKS4 Proxy deny the request */ + CO_ER_SOCKS4_ABORT, /* SOCKS4 Proxy handshake aborted by server */ + + CO_ERR_SSL_FATAL, /* SSL fatal error during a SSL_read or SSL_write */ + + CO_ER_REVERSE, /* Error during reverse connect */ +}; + +/* error return codes for accept_conn() */ +enum { + CO_AC_NONE = 0, /* no error, valid connection returned */ + CO_AC_DONE, /* reached the end of the queue (typically EAGAIN) */ + CO_AC_RETRY, /* late signal delivery or anything requiring the caller to try again */ + CO_AC_YIELD, /* short-lived limitation that requires a short pause */ + CO_AC_PAUSE, /* long-lived issue (resource/memory allocation error, paused FD) */ + CO_AC_PERMERR, /* permanent, non-recoverable error (e.g. 
closed listener socket) */
+};
+
+/* source address settings for outgoing connections */
+enum {
+ /* Tproxy exclusive values from 0 to 7 */
+ CO_SRC_TPROXY_ADDR = 0x0001, /* bind to this non-local address when connecting */
+ CO_SRC_TPROXY_CIP = 0x0002, /* bind to the client's IP address when connecting */
+ CO_SRC_TPROXY_CLI = 0x0003, /* bind to the client's IP+port when connecting */
+ CO_SRC_TPROXY_DYN = 0x0004, /* bind to a dynamically computed non-local address */
+ CO_SRC_TPROXY_MASK = 0x0007, /* bind to a non-local address when connecting */
+
+ CO_SRC_BIND = 0x0008, /* bind to a specific source address when connecting */
+};
+
+/* flags that can be passed to xprt->rcv_buf() and mux->rcv_buf() */
+enum {
+ CO_RFL_BUF_WET = 0x0001, /* Buffer still has some output data present */
+ CO_RFL_BUF_FLUSH = 0x0002, /* Flush mux's buffers but don't read more data */
+ CO_RFL_READ_ONCE = 0x0004, /* don't loop even if the request/response is small */
+ CO_RFL_KEEP_RECV = 0x0008, /* Instruct the mux to still wait for read events */
+ CO_RFL_BUF_NOT_STUCK = 0x0010, /* Buffer is not stuck. Optims are possible during data copy */
+ CO_RFL_MAY_SPLICE = 0x0020, /* The producer can use the kernel splicing */
+};
+
+/* flags that can be passed to xprt->snd_buf() and mux->snd_buf() */
+enum {
+ CO_SFL_MSG_MORE = 0x0001, /* More data to come afterwards */
+ CO_SFL_STREAMER = 0x0002, /* Producer is continuously streaming data */
+};
+
+/* mux->shutr() modes */
+enum co_shr_mode {
+ CO_SHR_DRAIN = 0, /* read shutdown, drain any extra stuff */
+ CO_SHR_RESET = 1, /* read shutdown, reset any extra stuff */
+};
+
+/* mux->shutw() modes */
+enum co_shw_mode {
+ CO_SHW_NORMAL = 0, /* regular write shutdown */
+ CO_SHW_SILENT = 1, /* imminent close, don't notify peer */
+};
+
+/* known transport layers (for ease of lookup) */
+enum {
+ XPRT_RAW = 0,
+ XPRT_SSL = 1,
+ XPRT_HANDSHAKE = 2,
+ XPRT_QUIC = 3,
+ XPRT_ENTRIES /* must be last one */
+};
+
+/* MUX-specific flags */
+enum {
+ MX_FL_NONE = 0x00000000,
+ MX_FL_HTX = 0x00000001, /* set if it is an HTX multiplexer */
+ MX_FL_HOL_RISK = 0x00000002, /* set if the protocol is subject to head-of-line blocking on the server */
+ MX_FL_NO_UPG = 0x00000004, /* set if mux does not support any upgrade */
+ MX_FL_FRAMED = 0x00000008, /* mux working on top of a framed transport layer (QUIC) */
+ MX_FL_REVERSABLE = 0x00000010, /* mux supports connection reversal */
+};
+
+/* PROTO token registration */
+enum proto_proxy_mode {
+ PROTO_MODE_NONE = 0,
+ PROTO_MODE_TCP = 1 << 0, // must not be changed!
+ PROTO_MODE_HTTP = 1 << 1, // must not be changed!
+ PROTO_MODE_ANY = PROTO_MODE_TCP | PROTO_MODE_HTTP,
+};
+
+enum proto_proxy_side {
+ PROTO_SIDE_NONE = 0,
+ PROTO_SIDE_FE = 1, // same as PR_CAP_FE
+ PROTO_SIDE_BE = 2, // same as PR_CAP_BE
+ PROTO_SIDE_BOTH = PROTO_SIDE_FE | PROTO_SIDE_BE,
+};
+
+/* ctl command used by mux->ctl() */
+enum mux_ctl_type {
+ MUX_CTL_STATUS, /* Expects an int as output, sets it to a combination of MUX_STATUS flags */
+ MUX_CTL_EXIT_STATUS, /* Expects an int as output, sets the mux exit/error/HTTP status, if known, or 0 */
+ MUX_CTL_REVERSE_CONN, /* Notify about an active reverse connection accepted. 
 */
+ MUX_CTL_SUBS_RECV, /* Notify the mux it must wait for read events again */
+};
+
+/* sctl command used by mux->sctl() */
+enum mux_sctl_type {
+ MUX_SCTL_SID, /* Return the mux stream ID as output, as a signed 64-bit integer */
+};
+
+/* response for ctl MUX_STATUS */
+#define MUX_STATUS_READY (1 << 0)
+
+enum mux_exit_status {
+ MUX_ES_SUCCESS, /* Success */
+ MUX_ES_INVALID_ERR, /* invalid input */
+ MUX_ES_TOUT_ERR, /* timeout */
+ MUX_ES_NOTIMPL_ERR, /* not-implemented error */
+ MUX_ES_INTERNAL_ERR, /* internal error */
+ MUX_ES_UNKNOWN /* unknown status (must be the last) */
+};
+
+/* socks4 response length */
+#define SOCKS4_HS_RSP_LEN 8
+
+/* socks4 upstream proxy definitions */
+struct socks4_request {
+ uint8_t version; /* SOCKS version number, 1 byte, must be 0x04 for this version */
+ uint8_t command; /* 0x01 = establish a TCP/IP stream connection */
+ uint16_t port; /* port number, 2 bytes (in network byte order) */
+ uint32_t ip; /* IP address, 4 bytes (in network byte order) */
+ char user_id[8]; /* the user ID string, variable length, terminated with a null (0x00); Using "HAProxy\0" */
+};
+
+/* Describes a set of subscriptions. Multiple events may be registered at the
+ * same time. The callee should assume everything not pending for completion is
+ * implicitly possible. It's illegal to change the tasklet if events are still
+ * registered.
+ */
+struct wait_event {
+ struct tasklet *tasklet;
+ int events; /* set of enum sub_event_type above */
+};
+
+/* A connection handle is how we differentiate two connections on the lower
+ * layers. It usually is a file descriptor but can be a connection id. The
+ * CO_FL_FDLESS flag indicates which one is relevant.
+ */
+union conn_handle {
+ struct quic_conn *qc; /* Only present if this connection is a QUIC one (CO_FL_FDLESS=1) */
+ int fd; /* file descriptor, for regular sockets (CO_FL_FDLESS=0) */
+};
+
+/* xprt_ops describes transport-layer operations for a connection. They
+ * generally run over a socket-based control layer, but not always. Some
+ * of them are used for data transfer with the upper layer (rcv_*, snd_*)
+ * and the other ones are used to set up and release the transport layer. 
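+ *
+ * A minimal echo-style sketch of how the data-transfer callbacks are driven
+ * (buf and count are hypothetical; error handling elided):
+ *
+ *	size_t ret = conn->xprt->rcv_buf(conn, conn->xprt_ctx, &buf, count, 0);
+ *	if (ret)
+ *		ret = conn->xprt->snd_buf(conn, conn->xprt_ctx, &buf, ret, 0);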
+ */ +struct xprt_ops { + size_t (*rcv_buf)(struct connection *conn, void *xprt_ctx, struct buffer *buf, size_t count, int flags); /* recv callback */ + size_t (*snd_buf)(struct connection *conn, void *xprt_ctx, const struct buffer *buf, size_t count, int flags); /* send callback */ + int (*rcv_pipe)(struct connection *conn, void *xprt_ctx, struct pipe *pipe, unsigned int count); /* recv-to-pipe callback */ + int (*snd_pipe)(struct connection *conn, void *xprt_ctx, struct pipe *pipe, unsigned int count); /* send-to-pipe callback */ + void (*shutr)(struct connection *conn, void *xprt_ctx, int); /* shutr function */ + void (*shutw)(struct connection *conn, void *xprt_ctx, int); /* shutw function */ + void (*close)(struct connection *conn, void *xprt_ctx); /* close the transport layer */ + int (*init)(struct connection *conn, void **ctx); /* initialize the transport layer */ + int (*start)(struct connection *conn, void *ctx); /* Start the transport layer, if needed */ + int (*prepare_bind_conf)(struct bind_conf *conf); /* prepare a whole bind_conf */ + void (*destroy_bind_conf)(struct bind_conf *conf); /* destroy a whole bind_conf */ + int (*prepare_srv)(struct server *srv); /* prepare a server context */ + void (*destroy_srv)(struct server *srv); /* destroy a server context */ + int (*get_alpn)(const struct connection *conn, void *xprt_ctx, const char **str, int *len); /* get application layer name */ + int (*takeover)(struct connection *conn, void *xprt_ctx, int orig_tid); /* Let the xprt know the fd have been taken over */ + void (*set_idle)(struct connection *conn, void *xprt_ctx); /* notify the xprt that the connection becomes idle. implies set_used. */ + void (*set_used)(struct connection *conn, void *xprt_ctx); /* notify the xprt that the connection leaves idle. implies set_idle. */ + char name[8]; /* transport layer name, zero-terminated */ + int (*subscribe)(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es); /* Subscribe <es> to events, such as "being able to send" */ + int (*unsubscribe)(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es); /* Unsubscribe <es> from events */ + int (*remove_xprt)(struct connection *conn, void *xprt_ctx, void *toremove_ctx, const struct xprt_ops *newops, void *newctx); /* Remove an xprt from the connection, used by temporary xprt such as the handshake one */ + int (*add_xprt)(struct connection *conn, void *xprt_ctx, void *toadd_ctx, const struct xprt_ops *toadd_ops, void **oldxprt_ctx, const struct xprt_ops **oldxprt_ops); /* Add a new XPRT as the new xprt, and return the old one */ + struct ssl_sock_ctx *(*get_ssl_sock_ctx)(struct connection *); /* retrieve the ssl_sock_ctx in use, or NULL if none */ + int (*show_fd)(struct buffer *, const struct connection *, const void *ctx); /* append some data about xprt for "show fd"; returns non-zero if suspicious */ +}; + +/* mux_ops describes the mux operations, which are to be performed at the + * connection level after data are exchanged with the transport layer in order + * to propagate them to streams. The <init> function will automatically be + * called once the mux is instantiated by the connection's owner at the end + * of a transport handshake, when it is about to transfer data and the data + * layer is not ready yet. 
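+ *
+ * For illustration only, a rough sketch of an upper layer subscribing to
+ * read events through the mux (sc and ev are hypothetical):
+ *
+ *	struct wait_event ev; // tasklet/events set up beforehand
+ *	if (conn->mux->subscribe(sc, SUB_RETRY_RECV, &ev) != 0)
+ *		; // subscription failed, try again later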
+ */
+struct mux_ops {
+ int (*init)(struct connection *conn, struct proxy *prx, struct session *sess, struct buffer *input); /* early initialization */
+ int (*wake)(struct connection *conn); /* mux-layer callback to report activity, mandatory */
+ size_t (*rcv_buf)(struct stconn *sc, struct buffer *buf, size_t count, int flags); /* Called from the upper layer to get data */
+ size_t (*snd_buf)(struct stconn *sc, struct buffer *buf, size_t count, int flags); /* Called from the upper layer to send data */
+ size_t (*nego_fastfwd)(struct stconn *sc, struct buffer *input, size_t count, unsigned int may_splice); /* Callback to fill the SD iobuf */
+ size_t (*done_fastfwd)(struct stconn *sc); /* Callback to terminate fast data forwarding */
+ int (*fastfwd)(struct stconn *sc, unsigned int count, unsigned int flags); /* Callback to init fast data forwarding */
+ int (*resume_fastfwd)(struct stconn *sc, unsigned int flags); /* Callback to resume fast data forwarding */
+ void (*shutr)(struct stconn *sc, enum co_shr_mode); /* shutr function */
+ void (*shutw)(struct stconn *sc, enum co_shw_mode); /* shutw function */
+
+ int (*attach)(struct connection *conn, struct sedesc *, struct session *sess); /* attach an stconn to an outgoing connection */
+ struct stconn *(*get_first_sc)(const struct connection *); /* retrieves any valid stconn from this connection */
+ void (*detach)(struct sedesc *); /* Detach an stconn from the sedesc of an outgoing connection, when the request is done */
+ int (*show_fd)(struct buffer *, struct connection *); /* append some data about connection into chunk for "show fd"; returns non-zero if suspicious */
+ int (*show_sd)(struct buffer *, struct sedesc *, const char *pfx); /* append some data about the mux stream into chunk for "show sess"; returns non-zero if suspicious */
+ int (*subscribe)(struct stconn *sc, int event_type, struct wait_event *es); /* Subscribe <es> to events, such as "being able to send" */
+ int (*unsubscribe)(struct stconn *sc, int event_type, struct wait_event *es); /* Unsubscribe <es> from events */
+ int (*sctl)(struct stconn *sc, enum mux_sctl_type mux_sctl, void *arg); /* Provides information about the mux stream */
+ int (*avail_streams)(struct connection *conn); /* Returns the number of streams still available for a connection */
+ int (*avail_streams_bidi)(struct connection *conn); /* Returns the number of bidirectional streams still available for a connection */
+ int (*avail_streams_uni)(struct connection *conn); /* Returns the number of unidirectional streams still available for a connection */
+ int (*used_streams)(struct connection *conn); /* Returns the number of streams in use on a connection. */
+ void (*destroy)(void *ctx); /* Let the mux know one of its users left, so it may have to disappear */
+ int (*ctl)(struct connection *conn, enum mux_ctl_type mux_ctl, void *arg); /* Provides information about the mux connection */
+ int (*takeover)(struct connection *conn, int orig_tid); /* Attempts to migrate the connection to the current thread */
+ unsigned int flags; /* some flags characterizing the mux's capabilities (MX_FL_*) */
+ char name[8]; /* mux layer name, zero-terminated */
+};
+
+/* list of frontend connections. Used to call mux wake operation on soft-stop
+ * to close idling connections. 
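+ *
+ * A rough sketch of the soft-stop walk this list is meant for (assuming the
+ * per-thread mux_stopping_data array declared in haproxy/connection.h):
+ *
+ *	struct connection *conn;
+ *	list_for_each_entry(conn, &mux_stopping_data[tid].list, stopping_list)
+ *		conn->mux->wake(conn); // ask each mux to close when idle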
+ */
+struct mux_stopping_data {
+ struct list list; /* list of registered frontend connections */
+ struct task *task; /* task woken up on soft-stop */
+};
+
+struct my_tcphdr {
+ uint16_t source;
+ uint16_t dest;
+};
+
+/* a connection source profile defines all the parameters needed to properly
+ * bind an outgoing connection for a server or proxy.
+ */
+struct conn_src {
+ unsigned int opts; /* CO_SRC_* */
+ int iface_len; /* bind interface name length */
+ char *iface_name; /* bind interface name or NULL */
+ struct port_range *sport_range; /* optional per-server TCP source ports */
+ struct sockaddr_storage source_addr; /* the address to which we want to bind for connect() */
+#if defined(CONFIG_HAP_TRANSPARENT)
+ struct sockaddr_storage tproxy_addr; /* non-local address we want to bind to for connect() */
+ char *bind_hdr_name; /* bind to this header name if defined */
+ int bind_hdr_len; /* length of the name of the header above */
+ int bind_hdr_occ; /* occurrence number of header above: >0 = from first, <0 = from end, 0=disabled */
+#endif
+};
+
+/* Hash header flag reflecting the input parameters present
+ * CAUTION! Always update CONN_HASH_PARAMS_TYPE_COUNT when adding a new entry.
+ */
+enum conn_hash_params_t {
+ CONN_HASH_PARAMS_TYPE_SNI = 0x1,
+ CONN_HASH_PARAMS_TYPE_DST_ADDR = 0x2,
+ CONN_HASH_PARAMS_TYPE_DST_PORT = 0x4,
+ CONN_HASH_PARAMS_TYPE_SRC_ADDR = 0x8,
+ CONN_HASH_PARAMS_TYPE_SRC_PORT = 0x10,
+ CONN_HASH_PARAMS_TYPE_PROXY = 0x20,
+};
+#define CONN_HASH_PARAMS_TYPE_COUNT 6
+
+#define CONN_HASH_PAYLOAD_LEN \
+ (((sizeof(((struct conn_hash_node *)0)->node.key)) * 8) - CONN_HASH_PARAMS_TYPE_COUNT)
+
+#define CONN_HASH_GET_PAYLOAD(hash) \
+ (((hash) << CONN_HASH_PARAMS_TYPE_COUNT) >> CONN_HASH_PARAMS_TYPE_COUNT)
+
+/* To avoid overflow, dynamically sized parameters must be pre-hashed. Their
+ * hashes will then be reused as input for the generation of the final
+ * connection hash.
+ */
+struct conn_hash_params {
+ uint64_t sni_prehash;
+ uint64_t proxy_prehash;
+ void *target;
+ struct sockaddr_storage *src_addr;
+ struct sockaddr_storage *dst_addr;
+};
+
+/*
+ * This structure describes a TLV entry consisting of its type
+ * and corresponding payload. This can be used to construct a list
+ * from which arbitrary TLV payloads can be fetched.
+ * It might be possible to embed the 'tlv struct' here in the future.
+ */
+struct conn_tlv_list {
+ struct list list;
+ unsigned short len; // 65535 should be more than enough!
+ unsigned char type;
+ char value[0];
+} __attribute__((packed));
+
+/* This structure describes a connection with its methods and data.
+ * A connection may be performed to a proxy or server via a local or remote
+ * socket, and can also be made to an internal applet. It can support
+ * several transport schemes (raw, ssl, ...). It can support several
+ * connection control schemes, generally a protocol for socket-oriented
+ * connections, but other methods for applets.
+ */
+struct connection {
+ /* first cache line */
+ enum obj_type obj_type; /* differentiates connection from applet context */
+ unsigned char err_code; /* CO_ER_* */
+ signed short send_proxy_ofs; /* <0 = offset to (re)send from the end, >0 = send all (reused for SOCKS4) */
+ unsigned int flags; /* CO_FL_* */
+ const struct protocol *ctrl; /* operations at the socket layer */
+ const struct xprt_ops *xprt; /* operations at the transport layer */
+ const struct mux_ops *mux; /* mux layer operations. 
Must be set before xprt->init() */ + void *xprt_ctx; /* general purpose pointer, initialized to NULL */ + void *ctx; /* highest level context (usually the mux), initialized to NULL */ + void *owner; /* pointer to the owner session, or NULL */ + enum obj_type *target; /* the target to connect to (server, proxy, applet, ...) */ + + /* second cache line */ + struct wait_event *subs; /* Task to wake when awaited events are ready */ + union { + struct list idle_list; /* list element for idle connection in server idle list */ + struct mt_list toremove_list; /* list element when idle connection is ready to be purged */ + }; + union { + struct list session_list; /* used by backend conns, list of attached connections to a session */ + struct list stopping_list; /* used by frontend conns, attach point in mux stopping list */ + }; + union conn_handle handle; /* connection handle at the socket layer */ + const struct netns_entry *proxy_netns; + + /* third cache line and beyond */ + void (*destroy_cb)(struct connection *conn); /* callback to notify of imminent death of the connection */ + struct sockaddr_storage *src; /* source address (pool), when known, otherwise NULL */ + struct sockaddr_storage *dst; /* destination address (pool), when known, otherwise NULL */ + struct list tlv_list; /* list of TLVs received via PROXYv2 */ + + /* used to identify a backend connection for http-reuse, + * thus only present if conn.target is of type OBJ_TYPE_SERVER + */ + struct conn_hash_node *hash_node; + + /* Members used if connection must be reversed. */ + struct { + enum obj_type *target; /* Listener for active reverse, server for passive. */ + struct buffer name; /* Only used for passive reverse. Used as SNI when connection added to server idle pool. */ + } reverse; +}; + +/* node for backend connection in the idle trees for http-reuse + * A connection is identified by a hash generated from its specific parameters + */ +struct conn_hash_node { + struct eb64_node node; /* contains the hashing key */ + struct connection *conn; /* connection owner of the node */ +}; + +struct mux_proto_list { + const struct ist token; /* token name and length. 
Empty is catch-all */ + enum proto_proxy_mode mode; + enum proto_proxy_side side; + const struct mux_ops *mux; + struct list list; +}; + +/* proxy protocol stuff below */ + +/* proxy protocol v2 definitions */ +#define PP2_SIGNATURE "\x0D\x0A\x0D\x0A\x00\x0D\x0A\x51\x55\x49\x54\x0A" +#define PP2_SIGNATURE_LEN 12 +#define PP2_HEADER_LEN 16 + +/* ver_cmd byte */ +#define PP2_CMD_LOCAL 0x00 +#define PP2_CMD_PROXY 0x01 +#define PP2_CMD_MASK 0x0F + +#define PP2_VERSION 0x20 +#define PP2_VERSION_MASK 0xF0 + +/* fam byte */ +#define PP2_TRANS_UNSPEC 0x00 +#define PP2_TRANS_STREAM 0x01 +#define PP2_TRANS_DGRAM 0x02 +#define PP2_TRANS_MASK 0x0F + +#define PP2_FAM_UNSPEC 0x00 +#define PP2_FAM_INET 0x10 +#define PP2_FAM_INET6 0x20 +#define PP2_FAM_UNIX 0x30 +#define PP2_FAM_MASK 0xF0 + +#define PP2_ADDR_LEN_UNSPEC (0) +#define PP2_ADDR_LEN_INET (4 + 4 + 2 + 2) +#define PP2_ADDR_LEN_INET6 (16 + 16 + 2 + 2) +#define PP2_ADDR_LEN_UNIX (108 + 108) + +#define PP2_HDR_LEN_UNSPEC (PP2_HEADER_LEN + PP2_ADDR_LEN_UNSPEC) +#define PP2_HDR_LEN_INET (PP2_HEADER_LEN + PP2_ADDR_LEN_INET) +#define PP2_HDR_LEN_INET6 (PP2_HEADER_LEN + PP2_ADDR_LEN_INET6) +#define PP2_HDR_LEN_UNIX (PP2_HEADER_LEN + PP2_ADDR_LEN_UNIX) + +#define PP2_TYPE_ALPN 0x01 +#define PP2_TYPE_AUTHORITY 0x02 +#define PP2_TYPE_CRC32C 0x03 +#define PP2_TYPE_NOOP 0x04 +#define PP2_TYPE_UNIQUE_ID 0x05 +#define PP2_TYPE_SSL 0x20 +#define PP2_SUBTYPE_SSL_VERSION 0x21 +#define PP2_SUBTYPE_SSL_CN 0x22 +#define PP2_SUBTYPE_SSL_CIPHER 0x23 +#define PP2_SUBTYPE_SSL_SIG_ALG 0x24 +#define PP2_SUBTYPE_SSL_KEY_ALG 0x25 +#define PP2_TYPE_NETNS 0x30 + +#define PP2_CLIENT_SSL 0x01 +#define PP2_CLIENT_CERT_CONN 0x02 +#define PP2_CLIENT_CERT_SESS 0x04 + +#define PP2_CRC32C_LEN 4 /* Length of a CRC32C TLV value */ + +#define TLV_HEADER_SIZE 3 + +#define HA_PP2_AUTHORITY_MAX 255 /* Maximum length of an authority TLV */ +#define HA_PP2_TLV_VALUE_128 128 /* E.g., accommodate unique IDs (128 B) */ +#define HA_PP2_TLV_VALUE_256 256 /* E.g., accommodate authority TLVs (currently, <= 255 B) */ +#define HA_PP2_MAX_ALLOC 1024 /* Maximum TLV value for PPv2 to prevent DoS */ + +struct proxy_hdr_v2 { + uint8_t sig[12]; /* hex 0D 0A 0D 0A 00 0D 0A 51 55 49 54 0A */ + uint8_t ver_cmd; /* protocol version and command */ + uint8_t fam; /* protocol family and transport */ + uint16_t len; /* number of following bytes part of the header */ + union { + struct { /* for TCP/UDP over IPv4, len = 12 */ + uint32_t src_addr; + uint32_t dst_addr; + uint16_t src_port; + uint16_t dst_port; + } ip4; + struct { /* for TCP/UDP over IPv6, len = 36 */ + uint8_t src_addr[16]; + uint8_t dst_addr[16]; + uint16_t src_port; + uint16_t dst_port; + } ip6; + struct { /* for AF_UNIX sockets, len = 216 */ + uint8_t src_addr[108]; + uint8_t dst_addr[108]; + } unx; + } addr; +}; + +struct tlv { + uint8_t type; + uint8_t length_hi; + uint8_t length_lo; + uint8_t value[0]; // WT: don't use VAR_ARRAY here, it's an end of struct marker +}__attribute__((packed)); + +struct tlv_ssl { + struct tlv tlv; + uint8_t client; + uint32_t verify; + uint8_t sub_tlv[VAR_ARRAY]; +}__attribute__((packed)); + + +/* This structure is used to manage idle connections, their locking, and the + * list of such idle connections to be removed. It is per-thread and must be + * accessible from foreign threads. 
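+ *
+ * A hedged sketch of the intended locking pattern (the IDLE_CONNS_LOCK
+ * label and the per-thread array are assumptions based on this header):
+ *
+ *	HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock);
+ *	// take or purge idle connections, possibly on behalf of another thread
+ *	HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock);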
+ */
+struct idle_conns {
+ struct mt_list toremove_conns;
+ struct task *cleanup_task;
+ __decl_thread(HA_SPINLOCK_T idle_conns_lock);
+} THREAD_ALIGNED(64);
+
+#endif /* _HAPROXY_CONNECTION_T_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */ diff --git a/include/haproxy/connection.h b/include/haproxy/connection.h new file mode 100644 index 0000000..c7d9883 --- /dev/null +++ b/include/haproxy/connection.h @@ -0,0 +1,762 @@ +/*
+ * include/haproxy/connection.h
+ * This file contains connection function prototypes
+ *
+ * Copyright (C) 2000-2002 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_CONNECTION_H
+#define _HAPROXY_CONNECTION_H
+
+#include <import/ist.h>
+
+#include <haproxy/api.h>
+#include <haproxy/buf.h>
+#include <haproxy/connection-t.h>
+#include <haproxy/stconn-t.h>
+#include <haproxy/fd.h>
+#include <haproxy/list.h>
+#include <haproxy/listener-t.h>
+#include <haproxy/obj_type.h>
+#include <haproxy/pool-t.h>
+#include <haproxy/server.h>
+#include <haproxy/session-t.h>
+#include <haproxy/task-t.h>
+
+extern struct pool_head *pool_head_connection;
+extern struct pool_head *pool_head_conn_hash_node;
+extern struct pool_head *pool_head_sockaddr;
+extern struct pool_head *pool_head_pp_tlv_128;
+extern struct pool_head *pool_head_pp_tlv_256;
+extern struct pool_head *pool_head_uniqueid;
+extern struct xprt_ops *registered_xprt[XPRT_ENTRIES];
+extern struct mux_proto_list mux_proto_list;
+extern struct mux_stopping_data mux_stopping_data[MAX_THREADS];
+
+#define IS_HTX_CONN(conn) ((conn)->mux && ((conn)->mux->flags & MX_FL_HTX))
+
+/* receive a PROXY protocol header over a connection */
+int conn_recv_proxy(struct connection *conn, int flag);
+int conn_send_proxy(struct connection *conn, unsigned int flag);
+int make_proxy_line(char *buf, int buf_len, struct server *srv, struct connection *remote, struct stream *strm);
+struct conn_tlv_list *conn_get_tlv(struct connection *conn, int type);
+
+int conn_append_debug_info(struct buffer *buf, const struct connection *conn, const char *pfx);
+
+int conn_subscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es);
+int conn_unsubscribe(struct connection *conn, void *xprt_ctx, int event_type, struct wait_event *es);
+
+/* receive a NetScaler Client IP insertion header over a connection */
+int conn_recv_netscaler_cip(struct connection *conn, int flag);
+
+/* raw send() directly on the socket */
+int conn_ctrl_send(struct connection *conn, const void *buf, int len, int flags);
+
+/* drains any pending bytes from the socket */
+int conn_ctrl_drain(struct connection *conn);
+
+/* SOCKS4 proxy handshake */
+int conn_send_socks4_proxy_request(struct connection *conn);
+int conn_recv_socks4_proxy_response(struct connection *conn);
+
+/* If we delayed the mux creation because we were 
waiting for the handshake, do it now */
+int conn_create_mux(struct connection *conn);
+int conn_notify_mux(struct connection *conn, int old_flags, int forced_wake);
+int conn_upgrade_mux_fe(struct connection *conn, void *ctx, struct buffer *buf,
+ struct ist mux_proto, int mode);
+int conn_install_mux_fe(struct connection *conn, void *ctx);
+int conn_install_mux_be(struct connection *conn, void *ctx, struct session *sess,
+ const struct mux_ops *force_mux_ops);
+int conn_install_mux_chk(struct connection *conn, void *ctx, struct session *sess);
+
+void conn_delete_from_tree(struct connection *conn);
+
+void conn_init(struct connection *conn, void *target);
+struct connection *conn_new(void *target);
+void conn_free(struct connection *conn);
+struct conn_hash_node *conn_alloc_hash_node(struct connection *conn);
+struct sockaddr_storage *sockaddr_alloc(struct sockaddr_storage **sap, const struct sockaddr_storage *orig, socklen_t len);
+void sockaddr_free(struct sockaddr_storage **sap);
+
+
+/* connection hash stuff */
+uint64_t conn_calculate_hash(const struct conn_hash_params *params);
+uint64_t conn_hash_prehash(char *buf, size_t size);
+void conn_hash_update(char *buf, size_t *idx,
+ const void *data, size_t size,
+ enum conn_hash_params_t *flags,
+ enum conn_hash_params_t type);
+uint64_t conn_hash_digest(char *buf, size_t bufsize,
+ enum conn_hash_params_t flags);
+
+int conn_reverse(struct connection *conn);
+
+const char *conn_err_code_str(struct connection *c);
+int xprt_add_hs(struct connection *conn);
+void register_mux_proto(struct mux_proto_list *list);
+
+extern struct idle_conns idle_conns[MAX_THREADS];
+
+/* returns true if the transport layer is ready */
+static inline int conn_xprt_ready(const struct connection *conn)
+{
+ return (conn->flags & CO_FL_XPRT_READY);
+}
+
+/* returns true if the control layer is ready */
+static inline int conn_ctrl_ready(const struct connection *conn)
+{
+ return (conn->flags & CO_FL_CTRL_READY);
+}
+
+/*
+ * Calls the start() function of the transport layer, if needed.
+ * Returns < 0 in case of error.
+ */
+
+static inline int conn_xprt_start(struct connection *conn)
+{
+ int ret = 0;
+
+ if (!conn_xprt_ready(conn) && conn->xprt && conn->xprt->start)
+ ret = conn->xprt->start(conn, conn->xprt_ctx);
+
+ if (ret >= 0)
+ conn->flags |= CO_FL_XPRT_READY;
+
+ return ret;
+}
+
+/* Calls the close() function of the transport layer if any and if not done
+ * yet, and clears the CO_FL_XPRT_READY flag.
+ * However this is not done if the CO_FL_XPRT_TRACKED flag is set,
+ * which allows logs to take data from the transport layer very late if needed.
+ */
+static inline void conn_xprt_close(struct connection *conn)
+{
+ if (conn->xprt && !(conn->flags & CO_FL_XPRT_TRACKED)) {
+ if (conn->xprt->close)
+ conn->xprt->close(conn, conn->xprt_ctx);
+ conn->xprt_ctx = NULL;
+ conn->flags &= ~CO_FL_XPRT_READY;
+ conn->xprt = NULL;
+ }
+}
+
+/* Initializes the connection's control layer which essentially consists in
+ * registering the connection handle (e.g. file descriptor) for events and
+ * setting the CO_FL_CTRL_READY flag. The caller is responsible for ensuring
+ * that the control layer is already assigned to the connection prior to the
+ * call.
+ */
+static inline void conn_ctrl_init(struct connection *conn)
+{
+ if (!conn_ctrl_ready(conn)) {
+ conn->flags |= CO_FL_CTRL_READY;
+ if (conn->ctrl->ctrl_init)
+ conn->ctrl->ctrl_init(conn);
+ }
+}
+
+/* Deletes the connection's handle (e.g. 
FD) if the transport layer is already
+ * gone, and removes the CO_FL_CTRL_READY flag.
+ */
+static inline void conn_ctrl_close(struct connection *conn)
+{
+ if (!conn->xprt && (conn->flags & CO_FL_CTRL_READY)) {
+ if ((conn->flags & (CO_FL_WANT_DRAIN | CO_FL_SOCK_RD_SH)) == CO_FL_WANT_DRAIN)
+ conn_ctrl_drain(conn);
+ conn->flags &= ~CO_FL_CTRL_READY;
+ if (conn->ctrl->ctrl_close)
+ conn->ctrl->ctrl_close(conn);
+ }
+}
+
+/* If the connection still has a transport layer, then call its close() function
+ * if any, and delete the file descriptor if a control layer is set. This is
+ * used to close everything at once and atomically. However this is not done if
+ * the CO_FL_XPRT_TRACKED flag is set, which allows logs to take data from the
+ * transport layer very late if needed.
+ */
+static inline void conn_full_close(struct connection *conn)
+{
+ conn_xprt_close(conn);
+ conn_ctrl_close(conn);
+}
+
+/* stop tracking a connection, allowing conn_full_close() to always
+ * succeed.
+ */
+static inline void conn_stop_tracking(struct connection *conn)
+{
+ conn->flags &= ~CO_FL_XPRT_TRACKED;
+}
+
+/* returns the connection's FD if the connection exists, its control is ready,
+ * and the connection has an FD, otherwise -1.
+ */
+static inline int conn_fd(const struct connection *conn)
+{
+ if (!conn || !conn_ctrl_ready(conn) || (conn->flags & CO_FL_FDLESS))
+ return -1;
+ return conn->handle.fd;
+}
+
+/* read shutdown, called from the rcv_buf/rcv_pipe handlers when
+ * detecting an end of connection.
+ */
+static inline void conn_sock_read0(struct connection *c)
+{
+ c->flags |= CO_FL_SOCK_RD_SH;
+ if (conn_ctrl_ready(c)) {
+ /* we don't risk keeping ports unusable if we found the
+ * zero from the other side.
+ */
+ BUG_ON(c->flags & CO_FL_FDLESS);
+ HA_ATOMIC_AND(&fdtab[c->handle.fd].state, ~FD_LINGER_RISK);
+ }
+}
+
+/* write shutdown, indication that the upper layer is not willing to send
+ * anything anymore and wants to close after pending data are sent. The
+ * <clean> argument will allow not to perform the socket layer shutdown if
+ * equal to 0.
+ */
+static inline void conn_sock_shutw(struct connection *c, int clean)
+{
+ c->flags |= CO_FL_SOCK_WR_SH;
+ if (conn_ctrl_ready(c)) {
+ /* don't perform a clean shutdown if we're going to reset or
+ * if the shutr was already received.
+ */
+ BUG_ON(c->flags & CO_FL_FDLESS);
+ if (!(c->flags & CO_FL_SOCK_RD_SH) && clean)
+ shutdown(c->handle.fd, SHUT_WR);
+ }
+}
+
+static inline void conn_xprt_shutw(struct connection *c)
+{
+ /* clean data-layer shutdown */
+ if (c->xprt && c->xprt->shutw)
+ c->xprt->shutw(c, c->xprt_ctx, 1);
+}
+
+static inline void conn_xprt_shutw_hard(struct connection *c)
+{
+ /* unclean data-layer shutdown */
+ if (c->xprt && c->xprt->shutw)
+ c->xprt->shutw(c, c->xprt_ctx, 0);
+}
+
+
+/* detect sock->data read0 transition */
+static inline int conn_xprt_read0_pending(struct connection *c)
+{
+ return (c->flags & CO_FL_SOCK_RD_SH) != 0;
+}
+
+/* prepares a connection to work with protocol <proto> and transport <xprt>.
+ * The transport layer is initialized as well, and the mux and its context are
+ * cleared. The target is not reinitialized and it is recommended that it is
+ * set prior to calling this function so that the function may make use of it
+ * in the future to refine the mux choice if needed. 
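+ *
+ * Typical usage (sketch; <proto> and <srv> are hypothetical):
+ *
+ *	conn->target = &srv->obj_type; // set the target first if possible
+ *	if (conn_prepare(conn, proto, xprt_get(XPRT_RAW)) < 0)
+ *		; // xprt->init() failed and conn->xprt was reset to NULL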
+ */
+static inline int conn_prepare(struct connection *conn, const struct protocol *proto, const struct xprt_ops *xprt)
+{
+ int ret = 0;
+
+ conn->ctrl = proto;
+ conn->xprt = xprt;
+ conn->mux = NULL;
+ conn->xprt_ctx = NULL;
+ conn->ctx = NULL;
+ if (xprt->init) {
+ ret = xprt->init(conn, &conn->xprt_ctx);
+ if (ret < 0)
+ conn->xprt = NULL;
+ }
+ return ret;
+}
+
+/* returns 0 if the connection is valid and is a frontend connection, otherwise
+ * returns 1 indicating it's a backend connection. An uninitialized connection
+ * also returns 1 to better handle the usage in the middle of initialization.
+ */
+static inline int conn_is_back(const struct connection *conn)
+{
+ return !objt_listener(conn->target);
+}
+
+/* sets <owner> as the connection's owner */
+static inline void conn_set_owner(struct connection *conn, void *owner, void (*cb)(struct connection *))
+{
+ conn->owner = owner;
+ conn->destroy_cb = cb;
+}
+
+
+/* Mark the connection <conn> as private and remove it from the available connection list */
+static inline void conn_set_private(struct connection *conn)
+{
+ if (!(conn->flags & CO_FL_PRIVATE)) {
+ conn->flags |= CO_FL_PRIVATE;
+
+ if (obj_type(conn->target) == OBJ_TYPE_SERVER)
+ srv_release_conn(__objt_server(conn->target), conn);
+ }
+}
+
+static inline void conn_force_unsubscribe(struct connection *conn)
+{
+ if (!conn->subs)
+ return;
+ conn->subs->events = 0;
+ conn->subs = NULL;
+}
+
+/* Returns the source address of the connection or NULL if not set */
+static inline const struct sockaddr_storage *conn_src(struct connection *conn)
+{
+ return conn->src;
+}
+
+/* Returns the destination address of the connection or NULL if not set */
+static inline const struct sockaddr_storage *conn_dst(struct connection *conn)
+{
+ return conn->dst;
+}
+
+/* Retrieves the connection's original source address. Returns non-zero on
+ * success or zero on failure. The operation is only performed once and the
+ * address is stored in the connection for future use.
+ */
+static inline int conn_get_src(struct connection *conn)
+{
+ if (conn->src)
+ return 1;
+
+ if (!conn_ctrl_ready(conn))
+ goto fail;
+
+ if (!sockaddr_alloc(&conn->src, NULL, 0))
+ goto fail;
+
+ /* some stream protocols may provide their own get_src/dst functions */
+ if (conn->ctrl->get_src &&
+ conn->ctrl->get_src(conn, (struct sockaddr *)conn->src, sizeof(*conn->src)) != -1)
+ goto done;
+
+ if (conn->ctrl->proto_type != PROTO_TYPE_STREAM)
+ goto fail;
+
+ /* most other socket-based stream protocols will use their socket family's functions */
+ if (conn->ctrl->fam->get_src && !(conn->flags & CO_FL_FDLESS) &&
+ conn->ctrl->fam->get_src(conn->handle.fd, (struct sockaddr *)conn->src,
+ sizeof(*conn->src),
+ obj_type(conn->target) != OBJ_TYPE_LISTENER) != -1)
+ goto done;
+
+ /* no other means */
+ fail:
+ sockaddr_free(&conn->src);
+ return 0;
+ done:
+ return 1;
+}
+
+/* Retrieves the connection's original destination address. Returns non-zero on
+ * success or zero on failure. The operation is only performed once and the
+ * address is stored in the connection for future use. 
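+ *
+ * A short usage sketch (mirrors conn_get_src() above):
+ *
+ *	if (conn_get_dst(conn)) {
+ *		const struct sockaddr_storage *dst = conn_dst(conn);
+ *		// dst is valid and now cached on the connection
+ *	}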
+ */
+static inline int conn_get_dst(struct connection *conn)
+{
+ if (conn->dst)
+ return 1;
+
+ if (!conn_ctrl_ready(conn))
+ goto fail;
+
+ if (!sockaddr_alloc(&conn->dst, NULL, 0))
+ goto fail;
+
+ /* some stream protocols may provide their own get_src/dst functions */
+ if (conn->ctrl->get_dst &&
+ conn->ctrl->get_dst(conn, (struct sockaddr *)conn->dst, sizeof(*conn->dst)) != -1)
+ goto done;
+
+ if (conn->ctrl->proto_type != PROTO_TYPE_STREAM)
+ goto fail;
+
+ /* most other socket-based stream protocols will use their socket family's functions */
+ if (conn->ctrl->fam->get_dst && !(conn->flags & CO_FL_FDLESS) &&
+ conn->ctrl->fam->get_dst(conn->handle.fd, (struct sockaddr *)conn->dst,
+ sizeof(*conn->dst),
+ obj_type(conn->target) != OBJ_TYPE_LISTENER) != -1)
+ goto done;
+
+ /* no other means */
+ fail:
+ sockaddr_free(&conn->dst);
+ return 0;
+ done:
+ return 1;
+}
+
+/* Sets the TOS header in IPv4 and the traffic class header in IPv6 packets
+ * (as per RFC3260 #4 and BCP37 #4.2 and #5.2). The connection is tested and if
+ * it is null, nothing is done.
+ */
+static inline void conn_set_tos(const struct connection *conn, int tos)
+{
+ if (!conn || !conn_ctrl_ready(conn) || (conn->flags & CO_FL_FDLESS))
+ return;
+
+#ifdef IP_TOS
+ if (conn->src->ss_family == AF_INET)
+ setsockopt(conn->handle.fd, IPPROTO_IP, IP_TOS, &tos, sizeof(tos));
+#endif
+#ifdef IPV6_TCLASS
+ if (conn->src->ss_family == AF_INET6) {
+ if (IN6_IS_ADDR_V4MAPPED(&((struct sockaddr_in6 *)conn->src)->sin6_addr))
+ /* v4-mapped addresses need IP_TOS */
+ setsockopt(conn->handle.fd, IPPROTO_IP, IP_TOS, &tos, sizeof(tos));
+ else
+ setsockopt(conn->handle.fd, IPPROTO_IPV6, IPV6_TCLASS, &tos, sizeof(tos));
+ }
+#endif
+}
+
+/* Sets the netfilter mark on the connection's socket. The connection is tested
+ * and if it is null, nothing is done.
+ */
+static inline void conn_set_mark(const struct connection *conn, int mark)
+{
+ if (!conn || !conn_ctrl_ready(conn) || (conn->flags & CO_FL_FDLESS))
+ return;
+
+#if defined(SO_MARK)
+ setsockopt(conn->handle.fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
+#elif defined(SO_USER_COOKIE)
+ setsockopt(conn->handle.fd, SOL_SOCKET, SO_USER_COOKIE, &mark, sizeof(mark));
+#elif defined(SO_RTABLE)
+ setsockopt(conn->handle.fd, SOL_SOCKET, SO_RTABLE, &mark, sizeof(mark));
+#endif
+}
+
+/* Adjusts the TCP quick-ack feature on the connection's socket. The
+ * connection is tested and if it is null, nothing is done.
+ */
+static inline void conn_set_quickack(const struct connection *conn, int value)
+{
+ if (!conn || !conn_ctrl_ready(conn) || (conn->flags & CO_FL_FDLESS))
+ return;
+
+#ifdef TCP_QUICKACK
+ setsockopt(conn->handle.fd, IPPROTO_TCP, TCP_QUICKACK, &value, sizeof(value));
+#endif
+}
+
+static inline struct wait_event *wl_set_waitcb(struct wait_event *wl, struct task *(*cb)(struct task *, void *, unsigned int), void *ctx)
+{
+ if (!wl->tasklet->process) {
+ wl->tasklet->process = cb;
+ wl->tasklet->context = ctx;
+ }
+ return wl;
+}
+
+/* Installs the connection's mux layer for upper context <ctx>.
+ * Returns < 0 on error.
+ */
+static inline int conn_install_mux(struct connection *conn, const struct mux_ops *mux,
+ void *ctx, struct proxy *prx, struct session *sess)
+{
+ int ret;
+
+ conn->mux = mux;
+ conn->ctx = ctx;
+ ret = mux->init ? mux->init(conn, prx, sess, &BUF_NULL) : 0;
+ if (ret < 0) {
+ conn->mux = NULL;
+ conn->ctx = NULL;
+ }
+ return ret;
+}
+
+/* Retrieves any valid stream connector from this connection, preferably the first
+ * valid one. 
The purpose is to be able to figure out the other end of a private
+ * connection for purposes like source binding or proxy protocol header
+ * emission. In such cases, any stream connector is expected to be valid so the
+ * mux is encouraged to return the first one it finds. If the connection has
+ * no mux or the mux has no get_first_sc() method or the mux has no valid
+ * stream connector, NULL is returned. The output pointer is purposely marked
+ * const to discourage the caller from modifying anything there.
+ */
+static inline struct stconn *conn_get_first_sc(const struct connection *conn)
+{
+ BUG_ON(!conn || !conn->mux);
+
+ if (!conn->mux->get_first_sc)
+ return NULL;
+ return conn->mux->get_first_sc(conn);
+}
+
+int conn_update_alpn(struct connection *conn, const struct ist alpn, int force);
+
+static inline const char *conn_get_ctrl_name(const struct connection *conn)
+{
+ if (!conn || !conn_ctrl_ready(conn))
+ return "NONE";
+ return conn->ctrl->name;
+}
+
+static inline const char *conn_get_xprt_name(const struct connection *conn)
+{
+ if (!conn || !conn->xprt)
+ return "NONE";
+ return conn->xprt->name;
+}
+
+static inline const char *conn_get_mux_name(const struct connection *conn)
+{
+ if (!conn || !conn->mux)
+ return "NONE";
+ return conn->mux->name;
+}
+
+/* registers pointer to transport layer <id> (XPRT_*) */
+static inline void xprt_register(int id, struct xprt_ops *xprt)
+{
+ if (id >= XPRT_ENTRIES)
+ return;
+ registered_xprt[id] = xprt;
+}
+
+/* returns pointer to transport layer <id> (XPRT_*) or NULL if not registered */
+static inline struct xprt_ops *xprt_get(int id)
+{
+ if (id >= XPRT_ENTRIES)
+ return NULL;
+ return registered_xprt[id];
+}
+
+/* notify the next xprt that the connection is about to become idle and that it
+ * may be stolen at any time after the function returns and that any tasklet in
+ * the chain must be careful before dereferencing its context.
+ */
+static inline void xprt_set_idle(struct connection *conn, const struct xprt_ops *xprt, void *xprt_ctx)
+{
+ if (xprt->set_idle)
+ xprt->set_idle(conn, conn->xprt_ctx);
+}
+
+/* notify the next xprt that the connection is not idle anymore and that it may
+ * not be stolen before the next xprt_set_idle().
+ */
+static inline void xprt_set_used(struct connection *conn, const struct xprt_ops *xprt, void *xprt_ctx)
+{
+ if (xprt->set_used)
+ xprt->set_used(conn, conn->xprt_ctx);
+}
+
+static inline int conn_get_alpn(const struct connection *conn, const char **str, int *len)
+{
+ if (!conn_xprt_ready(conn) || !conn->xprt->get_alpn)
+ return 0;
+ return conn->xprt->get_alpn(conn, conn->xprt_ctx, str, len);
+}
+
+/* unregisters proto mux list <list> */
+static inline void unregister_mux_proto(struct mux_proto_list *list)
+{
+ LIST_DELETE(&list->list);
+ LIST_INIT(&list->list);
+}
+
+static inline struct mux_proto_list *get_mux_proto(const struct ist proto)
+{
+ struct mux_proto_list *item;
+
+ list_for_each_entry(item, &mux_proto_list.list, list) {
+ if (isteq(proto, item->token))
+ return item;
+ }
+ return NULL;
+}
+
+void list_mux_proto(FILE *out);
+/* returns the first mux entry in the list matching the exact same <mux_proto>
+ * and compatible with the <proto_side> (FE or BE) and the <proto_mode> (TCP or
+ * HTTP). <mux_proto> can be empty. Will fall back to the first compatible mux
+ * with exactly the same <proto_mode> or with an empty name. May return
+ * NULL if the code improperly registered the default mux to use as a fallback. 
+ * + * <proto_mode> expects PROTO_MODE_* value only: PROXY_MODE_* values should + * never be used directly here (but you may use conn_pr_mode_to_proto_mode() + * to map proxy mode to corresponding proto mode before calling the function). + */ +static inline const struct mux_proto_list *conn_get_best_mux_entry( + const struct ist mux_proto, + int proto_side, int proto_mode) +{ + struct mux_proto_list *item; + struct mux_proto_list *fallback = NULL; + + list_for_each_entry(item, &mux_proto_list.list, list) { + if (!(item->side & proto_side) || !(item->mode & proto_mode)) + continue; + if (istlen(mux_proto) && isteq(mux_proto, item->token)) + return item; + else if (!istlen(item->token)) { + if (!fallback || (item->mode == proto_mode && fallback->mode != proto_mode)) + fallback = item; + } + } + return fallback; + +} + +/* returns the first mux in the list matching the exact same <mux_proto> and + * compatible with the <proto_side> (FE or BE) and the <proto_mode> (TCP or + * HTTP). <mux_proto> can be empty. Will fall back to the first compatible mux + * with exactly the same <proto_mode> or with an empty name. May return + * null if the code improperly registered the default mux to use as a fallback. + */ +static inline const struct mux_ops *conn_get_best_mux(struct connection *conn, + const struct ist mux_proto, + int proto_side, int proto_mode) +{ + const struct mux_proto_list *item; + + item = conn_get_best_mux_entry(mux_proto, proto_side, proto_mode); + + return item ? item->mux : NULL; +} + +/* returns a pointer to the proxy associated with this connection. For a front + * connection it returns a pointer to the frontend ; for a back connection, it + * returns a pointer to the backend. + */ +static inline struct proxy *conn_get_proxy(const struct connection *conn) +{ + struct listener *l; + struct server *s; + + /* check if it's a frontend connection */ + l = objt_listener(conn->target); + if (l) + return l->bind_conf->frontend; + + /* check if it's a backend connection */ + s = objt_server(conn->target); + if (s) + return s->proxy; + + return objt_proxy(conn->target); +} + +/* unconditionally retrieves the ssl_sock_ctx for this connection. Prefer using + * the standard form conn_get_ssl_sock_ctx() which checks the transport layer + * and the availability of the method. + */ +static inline struct ssl_sock_ctx *__conn_get_ssl_sock_ctx(struct connection *conn) +{ + return conn->xprt->get_ssl_sock_ctx(conn); +} + +/* retrieves the ssl_sock_ctx for this connection otherwise NULL */ +static inline struct ssl_sock_ctx *conn_get_ssl_sock_ctx(struct connection *conn) +{ + if (!conn || !conn->xprt || !conn->xprt->get_ssl_sock_ctx) + return NULL; + return conn->xprt->get_ssl_sock_ctx(conn); +} + +/* boolean, returns true if connection is over SSL */ +static inline int conn_is_ssl(struct connection *conn) +{ + return !!conn_get_ssl_sock_ctx(conn); +} + +/* Returns true if connection must be reversed. */ +static inline int conn_is_reverse(const struct connection *conn) +{ + return !!(conn->reverse.target); +} + +/* Returns true if connection must be actively reversed or waiting to be accepted. */ +static inline int conn_reverse_in_preconnect(const struct connection *conn) +{ + return conn_is_back(conn) ? !!(conn->reverse.target) : + !!(conn->flags & CO_FL_ACT_REVERSING); +} + +/* Initialize <conn> as a reverse connection to <target>. 
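+ *
+ * Usage sketch (li is a hypothetical struct listener pointer; per the BUG_ON
+ * below, a backend connection must point to a listener and a frontend one to
+ * a server):
+ *
+ *	conn_set_reverse(conn, &li->obj_type); // backend side
+ *	if (conn_is_reverse(conn))
+ *		; // connection will be reversed once established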
*/ +static inline void conn_set_reverse(struct connection *conn, enum obj_type *target) +{ + /* Ensure the correct target type is used depending on the connection side before reverse. */ + BUG_ON((!conn_is_back(conn) && !objt_server(target)) || + (conn_is_back(conn) && !objt_listener(target))); + + conn->reverse.target = target; +} + +/* Returns the listener instance for connection used for active reverse. */ +static inline struct listener *conn_active_reverse_listener(const struct connection *conn) +{ + return conn_is_back(conn) ? __objt_listener(conn->reverse.target) : + __objt_listener(conn->target); +} + +/* + * Prepare TLV argument for redirecting fetches. + * Note that it is not possible to use an argument check function + * as that would require us to allow arguments for functions + * that do not need it. Alternatively, the sample logic could be + * adjusted to perform checks for no arguments and allocate + * in the check function. However, this does not seem worth the trouble. + */ +static inline void set_tlv_arg(int tlv_type, struct arg *tlv_arg) +{ + tlv_arg->type = ARGT_SINT; + tlv_arg->data.sint = tlv_type; +} + +/* + * Map proxy mode (PR_MODE_*) to equivalent proto_proxy_mode (PROTO_MODE_*) + */ +static inline int conn_pr_mode_to_proto_mode(int proxy_mode) +{ + int mode; + + /* for now we only support TCP and HTTP proto_modes, so we + * consider that if it's not HTTP, then it's TCP + */ + mode = 1 << (proxy_mode == PR_MODE_HTTP); + + return mode; +} + +#endif /* _HAPROXY_CONNECTION_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/counters-t.h b/include/haproxy/counters-t.h new file mode 100644 index 0000000..933c228 --- /dev/null +++ b/include/haproxy/counters-t.h @@ -0,0 +1,128 @@ +/* + * include/haproxy/counters-t.h + * This file contains structure declarations for statistics counters. + * + * Copyright 2008-2009 Krzysztof Piotr Oledzki <ole@ans.pl> + * Copyright 2011-2014 Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_COUNTERS_T_H +#define _HAPROXY_COUNTERS_T_H + +/* counters used by listeners and frontends */ +struct fe_counters { + unsigned int conn_max; /* max # of active sessions */ + long long cum_conn; /* cumulated number of received connections */ + long long cum_sess; /* cumulated number of accepted connections */ + long long cum_sess_ver[3]; /* cumulated number of h1/h2/h3 sessions */ + + unsigned int cps_max; /* maximum of new connections received per second */ + unsigned int sps_max; /* maximum of new connections accepted per second (sessions) */ + + long long bytes_in; /* number of bytes transferred from the client to the server */ + long long bytes_out; /* number of bytes transferred from the server to the client */ + + /* compression counters, index 0 for requests, 1 for responses */ + long long comp_in[2]; /* input bytes fed to the compressor */ + long long comp_out[2]; /* output bytes emitted by the compressor */ + long long comp_byp[2]; /* input bytes that bypassed the compressor (cpu/ram/bw limitation) */ + + long long denied_req; /* blocked requests because of security concerns */ + long long denied_resp; /* blocked responses because of security concerns */ + long long failed_req; /* failed requests (eg: invalid or timeout) */ + long long denied_conn; /* denied connection requests (tcp-req-conn rules) */ + long long denied_sess; /* denied session requests (tcp-req-sess rules) */ + long long failed_rewrites; /* failed rewrites (warning) */ + long long internal_errors; /* internal processing errors */ + + long long cli_aborts; /* aborted responses during DATA phase caused by the client */ + long long srv_aborts; /* aborted responses during DATA phase caused by the server */ + long long intercepted_req; /* number of monitoring or stats requests intercepted by the frontend */ + + union { + struct { + long long cum_req[4]; /* cumulated number of processed other/h1/h2/h3 requests */ + long long comp_rsp; /* number of compressed responses */ + unsigned int rps_max; /* maximum of new HTTP requests second observed */ + long long rsp[6]; /* http response codes */ + long long cache_lookups;/* cache lookups */ + long long cache_hits; /* cache hits */ + } http; + } p; /* protocol-specific stats */ +}; + +/* counters used by servers and backends */ +struct be_counters { + unsigned int conn_max; /* max # of active sessions */ + long long cum_conn; /* cumulated number of received connections */ + long long cum_sess; /* cumulated number of accepted connections */ + long long cum_lbconn; /* cumulated number of sessions processed by load balancing (BE only) */ + unsigned long last_sess; /* last session time */ + + unsigned int cps_max; /* maximum of new connections received per second */ + unsigned int sps_max; /* maximum of new connections accepted per second (sessions) */ + unsigned int nbpend_max; /* max number of pending connections with no server assigned yet */ + unsigned int cur_sess_max; /* max number of currently active sessions */ + + long long bytes_in; /* number of bytes transferred from the client to the server */ + long long bytes_out; /* number of bytes transferred from the server to the client */ + + /* compression counters, index 0 for requests, 1 for responses */ + long long comp_in[2]; /* input bytes fed to the compressor */ + long long 
comp_out[2]; /* output bytes emitted by the compressor */ + long long comp_byp[2]; /* input bytes that bypassed the compressor (cpu/ram/bw limitation) */ + + long long denied_req; /* blocked requests because of security concerns */ + long long denied_resp; /* blocked responses because of security concerns */ + + long long connect; /* number of connection establishment attempts */ + long long reuse; /* number of connection reuses */ + long long failed_conns; /* failed connect() attempts (BE only) */ + long long failed_resp; /* failed responses (BE only) */ + long long cli_aborts; /* aborted responses during DATA phase caused by the client */ + long long srv_aborts; /* aborted responses during DATA phase caused by the server */ + long long retries; /* retried and redispatched connections (BE only) */ + long long redispatches; /* retried and redispatched connections (BE only) */ + long long failed_rewrites; /* failed rewrites (warning) */ + long long internal_errors; /* internal processing errors */ + + long long failed_checks, failed_hana; /* failed health checks and health analyses for servers */ + long long down_trans; /* up->down transitions */ + + unsigned int q_time, c_time, d_time, t_time; /* sums of conn_time, queue_time, data_time, total_time */ + unsigned int qtime_max, ctime_max, dtime_max, ttime_max; /* maximum of conn_time, queue_time, data_time, total_time observed */ + + union { + struct { + long long cum_req; /* cumulated number of processed HTTP requests */ + long long comp_rsp; /* number of compressed responses */ + unsigned int rps_max; /* maximum of new HTTP requests second observed */ + long long rsp[6]; /* http response codes */ + long long cache_lookups;/* cache lookups */ + long long cache_hits; /* cache hits */ + } http; + } p; /* protocol-specific stats */ +}; + +#endif /* _HAPROXY_COUNTERS_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/cpuset-t.h b/include/haproxy/cpuset-t.h new file mode 100644 index 0000000..d3ebb35 --- /dev/null +++ b/include/haproxy/cpuset-t.h @@ -0,0 +1,54 @@ +#ifndef _HAPROXY_CPUSET_T_H +#define _HAPROXY_CPUSET_T_H + +#define _GNU_SOURCE +#include <sched.h> + +#if defined(__FreeBSD__) || defined(__DragonFly__) || defined(__NetBSD__) +#include <sys/param.h> +#ifdef __FreeBSD__ +#include <sys/_cpuset.h> +#include <sys/cpuset.h> +#include <sys/sysctl.h> +#include <strings.h> +#endif +#endif + +#include <haproxy/api-t.h> + +#if defined(__linux__) || defined(__DragonFly__) || \ + (defined(__FreeBSD_kernel__) && defined(__GLIBC__)) + +# define CPUSET_REPR cpu_set_t +# define CPUSET_USE_CPUSET + +#elif defined(__FreeBSD__) || defined(__NetBSD__) + +# define CPUSET_REPR cpuset_t + +# if defined(__FreeBSD__) && __FreeBSD_version >= 1301000 +# define CPUSET_USE_CPUSET +# else +# define CPUSET_USE_FREEBSD_CPUSET +# endif + +#elif defined(__APPLE__) + +# define CPUSET_REPR unsigned long +# define CPUSET_USE_ULONG + +#else + +# error "No cpuset support implemented on this platform" + +#endif + +struct hap_cpuset { + CPUSET_REPR cpuset; +}; + +struct cpu_map { + struct hap_cpuset thread[MAX_THREADS_PER_GROUP]; /* list of CPU masks for the 32/64 threads of this group */ +}; + +#endif /* _HAPROXY_CPUSET_T_H */ diff --git a/include/haproxy/cpuset.h b/include/haproxy/cpuset.h new file mode 100644 index 0000000..87c4ece --- /dev/null +++ b/include/haproxy/cpuset.h @@ -0,0 +1,76 @@ +#ifndef _HAPROXY_CPUSET_H +#define _HAPROXY_CPUSET_H + +#include <haproxy/cpuset-t.h> + +extern struct 
cpu_map *cpu_map;
+
+/* Unset all indexes in <set>.
+ */
+void ha_cpuset_zero(struct hap_cpuset *set);
+
+/* Set <cpu> index in <set> if not present.
+ * Returns 0 on success otherwise non-zero.
+ */
+int ha_cpuset_set(struct hap_cpuset *set, int cpu);
+
+/* Clear <cpu> index in <set> if present.
+ * Returns 0 on success otherwise non-zero.
+ */
+int ha_cpuset_clr(struct hap_cpuset *set, int cpu);
+
+/* Bitwise AND equivalent operation between <src> and <dst> stored in <dst>.
+ */
+void ha_cpuset_and(struct hap_cpuset *dst, struct hap_cpuset *src);
+
+/* Bitwise OR equivalent operation between <src> and <dst> stored in <dst>.
+ */
+void ha_cpuset_or(struct hap_cpuset *dst, struct hap_cpuset *src);
+
+/* returns non-zero if CPU index <cpu> is set in <set>, otherwise 0. */
+int ha_cpuset_isset(const struct hap_cpuset *set, int cpu);
+
+/* Returns the number of indexes set in <set>.
+ */
+int ha_cpuset_count(const struct hap_cpuset *set);
+
+/* Returns the first index set plus one in <set>, starting from the lowest.
+ * Returns 0 if no index is set.
+ * Do not forget to subtract one from the result when using it for set/clr.
+ */
+int ha_cpuset_ffs(const struct hap_cpuset *set);
+
+/* Copy <src> set into <dst>.
+ */
+void ha_cpuset_assign(struct hap_cpuset *dst, struct hap_cpuset *src);
+
+/* Returns the biggest index plus one usable on the platform.
+ */
+int ha_cpuset_size(void);
+
+/* Detects CPUs that are bound to the current process. Returns the number of
+ * CPUs detected or 0 if the detection failed.
+ */
+int ha_cpuset_detect_bound(struct hap_cpuset *set);
+
+/* Parse cpu sets. Each CPU set is either a unique number between 0 and
+ * ha_cpuset_size() - 1 or a range with two such numbers delimited by a dash
+ * ('-'). Each CPU set can be a list of unique numbers or ranges separated by
+ * a comma. It is also possible to specify multiple cpu numbers or ranges in
+ * distinct arguments in <args>. On success, it returns 0, otherwise it
+ * returns 1 with an error message in <err>.
+ */
+int parse_cpu_set(const char **args, struct hap_cpuset *cpu_set, char **err);
+
+/* Parse a Linux cpu map string representing a numeric cpu mask.
+ * The cpu map string is a list of 4-byte hex strings separated by commas, with
+ * most-significant byte first, one bit per cpu number.
+ */
+void parse_cpumap(char *cpumap_str, struct hap_cpuset *cpu_set);
+
+/* Returns true if at least one cpu-map directive was configured, otherwise
+ * false.
+ */
+int cpu_map_configured(void);
+
+#endif /* _HAPROXY_CPUSET_H */
diff --git a/include/haproxy/debug.h b/include/haproxy/debug.h
new file mode 100644
index 0000000..b7a2e20
--- /dev/null
+++ b/include/haproxy/debug.h
@@ -0,0 +1,39 @@
+/*
+ * include/haproxy/debug.h
+ * This file contains some macros to help debugging.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_DEBUG_H +#define _HAPROXY_DEBUG_H + +struct task; +struct buffer; +extern unsigned int debug_commands_issued; +void ha_task_dump(struct buffer *buf, const struct task *task, const char *pfx); +void ha_thread_dump_one(int thr, int from_signal); +void ha_thread_dump(struct buffer *buf, int thr); +void ha_dump_backtrace(struct buffer *buf, const char *prefix, int dump); +void ha_backtrace_to_stderr(void); +void ha_panic(void); + +void post_mortem_add_component(const char *name, const char *version, + const char *toolchain, const char *toolchain_opts, + const char *build_settings, const char *path); + +#endif /* _HAPROXY_DEBUG_H */ diff --git a/include/haproxy/defaults.h b/include/haproxy/defaults.h new file mode 100644 index 0000000..7430c61 --- /dev/null +++ b/include/haproxy/defaults.h @@ -0,0 +1,533 @@ +/* + * include/haproxy/defaults.h + * Miscellaneous default values. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_DEFAULTS_H +#define _HAPROXY_DEFAULTS_H + +/* MAX_THREADS defines the highest limit for the global nbthread value. It + * defaults to the number of bits in a long integer when threads are enabled + * but may be lowered to save resources on embedded systems. +*/ +#ifndef USE_THREAD +/* threads disabled, 1 thread max, 1 group max (note: group ids start at 1) */ +#define MAX_THREADS 1 + +#define MAX_TGROUPS 1 +#define MAX_THREADS_PER_GROUP 1 + +#else + +/* theoretical limit is 64, though we'd rather not push it too far for now + * as some structures might be enlarged to be indexed per group. Let's start + * with 16 groups max, allowing to experiment with dual-socket machines + * suffering from up to 8 loosely coupled L3 caches. It's a good start and + * doesn't engage us too far. + */ +#ifndef MAX_TGROUPS +#define MAX_TGROUPS 16 +#endif + +#define MAX_THREADS_PER_GROUP __WORDSIZE + +/* threads enabled, max_threads defaults to long bits for 1 tgroup or 4 times + * long bits if more tgroups are enabled. + */ +#ifndef MAX_THREADS +#define MAX_THREADS ((((MAX_TGROUPS) > 1) ? 4 : 1) * (MAX_THREADS_PER_GROUP)) +#endif + +#endif // USE_THREAD + +/* + * BUFSIZE defines the size of a read and write buffer. It is the maximum + * amount of bytes which can be stored by the proxy for each stream. However, + * when reading HTTP headers, the proxy needs some spare space to add or rewrite + * headers if needed. The size of this spare is defined with MAXREWRITE. So it + * is not possible to process headers longer than BUFSIZE-MAXREWRITE bytes. 
By + * default, BUFSIZE=16384 bytes and MAXREWRITE=min(1024,BUFSIZE/2), so the + * maximum length of headers accepted is 15360 bytes. + */ +#ifndef BUFSIZE +#define BUFSIZE 16384 +#endif + +/* certain buffers may only be allocated for responses in order to avoid + * deadlocks caused by request queuing. 2 buffers is the absolute minimum + * acceptable to ensure that a request gaining access to a server can get + * a response buffer even if it doesn't completely flush the request buffer. + * The worst case is an applet making use of a request buffer that cannot + * completely be sent while the server starts to respond, and all unreserved + * buffers are allocated by request buffers from pending connections in the + * queue waiting for this one to flush. Both buffers reserved buffers may + * thus be used at the same time. + */ +#ifndef RESERVED_BUFS +#define RESERVED_BUFS 2 +#endif + +// reserved buffer space for header rewriting +#ifndef MAXREWRITE +#define MAXREWRITE 1024 +#endif + +#ifndef REQURI_LEN +#define REQURI_LEN 1024 +#endif + +#ifndef CAPTURE_LEN +#define CAPTURE_LEN 64 +#endif + +#ifndef MAX_SYSLOG_LEN +#define MAX_SYSLOG_LEN 1024 +#endif + +/* 64kB to archive startup-logs seems way more than enough + * /!\ Careful when changing this size, it is used in a shm when exec() from + * mworker to wait mode. + */ +#ifndef STARTUP_LOG_SIZE +#define STARTUP_LOG_SIZE 65536 +#endif + +// maximum line size when parsing config +#ifndef LINESIZE +#define LINESIZE 2048 +#endif + +// max # args on a configuration line +#define MAX_LINE_ARGS 64 + +// maximum line size when parsing crt-bind-list config +#define CRT_LINESIZE 65536 + +// max # args on crt-bind-list configuration line +#define MAX_CRT_ARGS 2048 + +// max # args on a command issued on the CLI ("stats socket") +// This should cover at least 5 + twice the # of data_types +#define MAX_CLI_ARGS 64 + +// max recursion levels in config condition evaluations +// (note that binary operators add one recursion level, and +// that parenthesis may add two). +#define MAX_CFG_RECURSION 1024 + +// max # of matches per regexp +#define MAX_MATCH 10 + +// max # of headers in one HTTP request or response +// By default, about 100 headers (+1 for the first line) +#ifndef MAX_HTTP_HDR +#define MAX_HTTP_HDR 101 +#endif + +// max # of headers in history when looking for header #-X +#ifndef MAX_HDR_HISTORY +#define MAX_HDR_HISTORY 10 +#endif + +// max length of a TRACE_PRINTF() output buffer (one less char for the message) +#ifndef TRACE_MAX_MSG +#define TRACE_MAX_MSG 1024 +#endif + +// max # of stick counters per session (at least 3 for sc0..sc2) +#ifndef MAX_SESS_STKCTR +#define MAX_SESS_STKCTR 3 +#endif + +// max # of extra stick-table data types that can be registered at runtime +#ifndef STKTABLE_EXTRA_DATA_TYPES +#define STKTABLE_EXTRA_DATA_TYPES 0 +#endif + +// max # of stick-table filter entries that can be used during dump +#ifndef STKTABLE_FILTER_LEN +#define STKTABLE_FILTER_LEN 4 +#endif + +// max # of loops we can perform around a read() which succeeds. +// It's very frequent that the system returns a few TCP segments at a time. +#ifndef MAX_READ_POLL_LOOPS +#define MAX_READ_POLL_LOOPS 4 +#endif + +// minimum number of bytes read at once above which we don't try to read +// more, in order not to risk facing an EAGAIN. 
Most often, if we read +// at least 10 kB, we can consider that the system has tried to read a +// full buffer and got multiple segments (>1 MSS for jumbo frames, >7 MSS +// for normal frames) did not bother truncating the last segment. +#ifndef MIN_RECV_AT_ONCE_ENOUGH +#define MIN_RECV_AT_ONCE_ENOUGH (7*1448) +#endif + +// The minimum number of bytes to be forwarded that is worth trying to splice. +// Below 4kB, it's not worth allocating pipes nor pretending to zero-copy. +#ifndef MIN_SPLICE_FORWARD +#define MIN_SPLICE_FORWARD 4096 +#endif + +// the max number of events returned in one call to poll/epoll. Too small a +// value will cause lots of calls, and too high a value may cause high latency. +#ifndef MAX_POLL_EVENTS +#define MAX_POLL_EVENTS 200 +#endif + +/* eternity when exprimed in timeval */ +#ifndef TV_ETERNITY +#define TV_ETERNITY (~0UL) +#endif + +/* eternity when exprimed in ms */ +#ifndef TV_ETERNITY_MS +#define TV_ETERNITY_MS (-1) +#endif + +/* delay between boot and first time wrap, in seconds */ +#ifndef BOOT_TIME_WRAP_SEC +#define BOOT_TIME_WRAP_SEC 20 +#endif +/* we want to be able to detect time jumps. Fix the maximum wait time to a low + * value so that we know the time has changed if we wait longer. + */ +#ifndef MAX_DELAY_MS +#define MAX_DELAY_MS 60000 +#endif + +// The maximum number of connections accepted at once by a thread for a single +// listener. It used to default to 64 divided by the number of processes but +// the tasklet-based model is much more scalable and benefits from smaller +// values. Experimentation has shown that 4 gives the highest accept rate for +// all thread values, and that 3 and 5 come very close, as shown below (HTTP/1 +// connections forwarded per second at multi-accept 4 and 64): +// +// ac\thr| 1 2 4 8 16 +// ------+------------------------------ +// 4| 80k 106k 168k 270k 336k +// 64| 63k 89k 145k 230k 274k +// +#ifndef MAX_ACCEPT +#define MAX_ACCEPT 4 +#endif + +// The base max number of tasks to run at once to be used when not set by +// tune.runqueue-depth. It will automatically be divided by the square root +// of the number of threads for better fairness. As such, 64 threads will +// use 35 and a single thread will use 280. +#ifndef RUNQUEUE_DEPTH +#define RUNQUEUE_DEPTH 280 +#endif + +// cookie delimiter in "prefix" mode. This character is inserted between the +// persistence cookie and the original value. The '~' is allowed by RFC6265, +// and should not be too common in server names. +#ifndef COOKIE_DELIM +#define COOKIE_DELIM '~' +#endif + +// this delimiter is used between a server's name and a last visit date in +// cookies exchanged with the client. +#ifndef COOKIE_DELIM_DATE +#define COOKIE_DELIM_DATE '|' +#endif + +// Max number of acl() sample fetch recursive evaluations, to avoid deep tree +// loops. +#ifndef ACL_MAX_RECURSE +#define ACL_MAX_RECURSE 1000 +#endif + +#define CONN_RETRIES 3 + +#define CHK_CONNTIME 2000 +#define DEF_CHKINTR 2000 +#define DEF_MAILALERTTIME 10000 +#define DEF_FALLTIME 3 +#define DEF_RISETIME 2 +#define DEF_AGENT_FALLTIME 1 +#define DEF_AGENT_RISETIME 1 +#define DEF_CHECK_PATH "" + + +#define DEF_HANA_ONERR HANA_ONERR_FAILCHK +#define DEF_HANA_ERRLIMIT 10 + +// X-Forwarded-For header default +#define DEF_XFORWARDFOR_HDR "X-Forwarded-For" + +// X-Original-To header default +#define DEF_XORIGINALTO_HDR "X-Original-To" + +/* Max number of events that may be processed at once by + * an event_hdl API consumer to prevent thread contention. 
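+ * (Illustrative reading of the default of 100 below: a consumer task would
+ * hand back control to the scheduler after draining at most 100 events from
+ * its queue in one wakeup; see the event_hdl implementation for the
+ * authoritative behavior.)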
+ */ +#ifndef EVENT_HDL_MAX_AT_ONCE +#define EVENT_HDL_MAX_AT_ONCE 100 +#endif + +/* Default connections limit. + * + * A system limit can be enforced at build time in order to avoid using haproxy + * beyond reasonable system limits. For this, just define SYSTEM_MAXCONN to the + * absolute limit accepted by the system. If the configuration specifies a + * higher value, it will be capped to SYSTEM_MAXCONN and a warning will be + * emitted. The only way to override this limit will be to set it via the + * command-line '-n' argument. If SYSTEM_MAXCONN is not set, a minimum value + * of 100 will be used for DEFAULT_MAXCONN which almost guarantees that a + * process will correctly start in any situation. + */ +#ifdef SYSTEM_MAXCONN +#undef DEFAULT_MAXCONN +#define DEFAULT_MAXCONN SYSTEM_MAXCONN +#elif !defined(DEFAULT_MAXCONN) +#define DEFAULT_MAXCONN 100 +#endif + +/* Define a maxconn which will be used in the master process once it re-exec to + * the MODE_MWORKER_WAIT and won't change when SYSTEM_MAXCONN is set. + * + * 100 must be enough for the master since it only does communication between + * the master and the workers, and the master CLI. + */ +#ifndef MASTER_MAXCONN +#define MASTER_MAXCONN 100 +#endif + +/* Minimum check interval for spread health checks. Servers with intervals + * greater than or equal to this value will have their checks spread apart + * and will be considered when searching the minimal interval. + * Others will be ignored for the minimal interval and will have their checks + * scheduled on a different basis. + */ +#ifndef SRV_CHK_INTER_THRES +#define SRV_CHK_INTER_THRES 1000 +#endif + +/* Specifies the string used to report the version and release date on the + * statistics page. May be defined to the empty string ("") to permanently + * disable the feature. + */ +#ifndef STATS_VERSION_STRING +#define STATS_VERSION_STRING " version " HAPROXY_VERSION ", released " HAPROXY_DATE +#endif + +/* This is the default statistics URI */ +#ifdef CONFIG_STATS_DEFAULT_URI +#define STATS_DEFAULT_URI CONFIG_STATS_DEFAULT_URI +#else +#define STATS_DEFAULT_URI "/haproxy?stats" +#endif + +/* This is the default statistics realm */ +#ifdef CONFIG_STATS_DEFAULT_REALM +#define STATS_DEFAULT_REALM CONFIG_STATS_DEFAULT_REALM +#else +#define STATS_DEFAULT_REALM "HAProxy Statistics" +#endif + +/* Maximum signal queue size, and also number of different signals we can + * handle. 
+ */ +#ifndef MAX_SIGNAL +#define MAX_SIGNAL 256 +#endif + +/* Maximum host name length */ +#ifndef MAX_HOSTNAME_LEN +#ifdef MAXHOSTNAMELEN +#define MAX_HOSTNAME_LEN MAXHOSTNAMELEN +#else +#define MAX_HOSTNAME_LEN 64 +#endif // MAXHOSTNAMELEN +#endif // MAX_HOSTNAME_LEN + +/* Maximum health check description length */ +#ifndef HCHK_DESC_LEN +#define HCHK_DESC_LEN 128 +#endif + +/* ciphers used as defaults on connect */ +#ifndef CONNECT_DEFAULT_CIPHERS +#define CONNECT_DEFAULT_CIPHERS NULL +#endif + +/* ciphers used as defaults on TLS 1.3 connect */ +#ifndef CONNECT_DEFAULT_CIPHERSUITES +#define CONNECT_DEFAULT_CIPHERSUITES NULL +#endif + +/* ciphers used as defaults on listeners */ +#ifndef LISTEN_DEFAULT_CIPHERS +#define LISTEN_DEFAULT_CIPHERS NULL +#endif + +/* cipher suites used as defaults on TLS 1.3 listeners */ +#ifndef LISTEN_DEFAULT_CIPHERSUITES +#define LISTEN_DEFAULT_CIPHERSUITES NULL +#endif + +/* named curve used as defaults for ECDHE ciphers */ +#ifndef ECDHE_DEFAULT_CURVE +#define ECDHE_DEFAULT_CURVE "prime256v1" +#endif + +/* ssl cache size */ +#ifndef SSLCACHESIZE +#define SSLCACHESIZE 20000 +#endif + +/* ssl max dh param size */ +#ifndef SSL_DEFAULT_DH_PARAM +#define SSL_DEFAULT_DH_PARAM 0 +#endif + +/* max memory cost per SSL session */ +#ifndef SSL_SESSION_MAX_COST +#define SSL_SESSION_MAX_COST (16*1024) // measured +#endif + +/* max memory cost per SSL handshake (on top of session) */ +#ifndef SSL_HANDSHAKE_MAX_COST +#define SSL_HANDSHAKE_MAX_COST (76*1024) // measured +#endif + +#ifndef DEFAULT_SSL_CTX_CACHE +#define DEFAULT_SSL_CTX_CACHE 1000 +#endif + +/* approximate stream size (for maxconn estimate) */ +#ifndef STREAM_MAX_COST +#define STREAM_MAX_COST (sizeof(struct stream) + \ + 2 * sizeof(struct channel) + \ + 2 * sizeof(struct connection) + \ + global.tune.requri_len + \ + 2 * global.tune.cookie_len) +#endif + +/* available memory estimate : count about 3% of overhead in various structures */ +#ifndef MEM_USABLE_RATIO +#define MEM_USABLE_RATIO 0.97 +#endif + +/* if not 0, maximum allocatable memory per process in MB */ +#ifndef HAPROXY_MEMMAX +#define HAPROXY_MEMMAX 0 +#endif + +/* For USE_ZLIB, DEFAULT_MAXZLIBMEM may be set to a hard-coded value that will + * preset a maxzlibmem value. Just leave it to zero for other configurations. + * Note that it's expressed in megabytes. + */ +#if !defined(DEFAULT_MAXZLIBMEM) || !defined(USE_ZLIB) +#undef DEFAULT_MAXZLIBMEM +#define DEFAULT_MAXZLIBMEM 0 +#endif + +/* On modern architectures with many threads, a fast memory allocator, and + * local pools, the global pools with their single list can be way slower than + * the standard allocator which already has its own per-thread arenas. In this + * case we disable global pools. The global pools may still be enforced + * using CONFIG_HAP_GLOBAL_POOLS though. 
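+ * (Illustrative build invocation, assuming the usual Makefile conventions:
+ * "make TARGET=linux-glibc DEFINE=-DCONFIG_HAP_GLOBAL_POOLS".)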
+ */ +#if defined(USE_THREAD) && defined(HA_HAVE_FAST_MALLOC) && !defined(CONFIG_HAP_GLOBAL_POOLS) +#define CONFIG_HAP_NO_GLOBAL_POOLS +#endif + +/* default per-thread pool cache size when enabled */ +#ifndef CONFIG_HAP_POOL_CACHE_SIZE +#define CONFIG_HAP_POOL_CACHE_SIZE 524288 +#endif + +#ifndef CONFIG_HAP_POOL_CLUSTER_SIZE +#define CONFIG_HAP_POOL_CLUSTER_SIZE 8 +#endif + +/* number of bits to encode the per-pool buckets for large setups */ +#ifndef CONFIG_HAP_POOL_BUCKETS_BITS +# if defined(USE_THREAD) && MAX_THREADS >= 512 +# define CONFIG_HAP_POOL_BUCKETS_BITS 6 +# elif defined(USE_THREAD) && MAX_THREADS >= 128 +# define CONFIG_HAP_POOL_BUCKETS_BITS 5 +# elif defined(USE_THREAD) && MAX_THREADS >= 16 +# define CONFIG_HAP_POOL_BUCKETS_BITS 4 +# elif defined(USE_THREAD) +# define CONFIG_HAP_POOL_BUCKETS_BITS 3 +# else +# define CONFIG_HAP_POOL_BUCKETS_BITS 0 +# endif +#endif + +#define CONFIG_HAP_POOL_BUCKETS (1UL << (CONFIG_HAP_POOL_BUCKETS_BITS)) + +/* Number of samples used to compute the times reported in stats. A power of + * two is highly recommended, and this value multiplied by the largest response + * time must not overflow and unsigned int. See freq_ctr.h for more information. + * We consider that values are accurate to 95% with two batches of samples below, + * so in order to advertise accurate times across 1k samples, we effectively + * measure over 512. + */ +#ifndef TIME_STATS_SAMPLES +#define TIME_STATS_SAMPLES 512 +#endif + +/* max ocsp cert id asn1 encoded length */ +#ifndef OCSP_MAX_CERTID_ASN1_LENGTH +#define OCSP_MAX_CERTID_ASN1_LENGTH 128 +#endif + +#ifndef OCSP_MAX_RESPONSE_TIME_SKEW +#define OCSP_MAX_RESPONSE_TIME_SKEW 300 +#endif + +/* Number of TLS tickets to check, used for rotation */ +#ifndef TLS_TICKETS_NO +#define TLS_TICKETS_NO 3 +#endif + +/* pattern lookup default cache size, in number of entries : + * 10k entries at 10k req/s mean 1% risk of a collision after 60 years, that's + * already much less than the memory's reliability in most machines and more + * durable than most admin's life expectancy. A collision will result in a + * valid result to be returned for a different entry from the same list. + */ +#ifndef DEFAULT_PAT_LRU_SIZE +#define DEFAULT_PAT_LRU_SIZE 10000 +#endif + +/* maximum number of pollers that may be registered */ +#ifndef MAX_POLLERS +#define MAX_POLLERS 10 +#endif + +/* system sysfs directory */ +#define NUMA_DETECT_SYSTEM_SYSFS_PATH "/sys/devices/system" + +/* Number of cache trees */ +#ifndef CACHE_TREE_NUM +# if defined(USE_THREAD) +# define CACHE_TREE_NUM 8 +# else +# define CACHE_TREE_NUM 1 +# endif +#endif + +#endif /* _HAPROXY_DEFAULTS_H */ diff --git a/include/haproxy/dgram-t.h b/include/haproxy/dgram-t.h new file mode 100644 index 0000000..4e4c2af --- /dev/null +++ b/include/haproxy/dgram-t.h @@ -0,0 +1,53 @@ +/* + * include/haproxy/dgram-t.h + * This file provides structures and types for datagram processing + * + * Copyright (C) 2014 Baptiste Assmann <bedis9@gmail.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_HAPROXY_DGRAM_T_H +#define _HAPROXY_HAPROXY_DGRAM_T_H + +#include <arpa/inet.h> + +/* + * datagram related structure + */ +struct dgram_conn { + __decl_thread(HA_SPINLOCK_T lock); + const struct dgram_data_cb *data; /* data layer callbacks. Must be set before */ + void *owner; /* pointer to upper layer's entity */ + union { /* definitions which depend on connection type */ + struct { /*** information used by socket-based dgram ***/ + int fd; /* file descriptor */ + } sock; + } t; + struct { + struct sockaddr_storage from; /* client address, or address to spoof when connecting to the server */ + struct sockaddr_storage to; /* address reached by the client, or address to connect to */ + } addr; /* addresses of the remote side, client for producer and server for consumer */ +}; + +/* + * datagram callback structure + */ +struct dgram_data_cb { + void (*recv)(struct dgram_conn *dgram); /* recv callback */ + void (*send)(struct dgram_conn *dgram); /* send callback */ +}; + +#endif /* _HAPROXY_HAPROXY_DGRAM_T_H */ diff --git a/include/haproxy/dgram.h b/include/haproxy/dgram.h new file mode 100644 index 0000000..92d00ab --- /dev/null +++ b/include/haproxy/dgram.h @@ -0,0 +1,29 @@ +/* + * include/haproxy/proto_dgram.h + * This file provides functions related to DGRAM processing. + * + * Copyright (C) 2014 Baptiste Assmann <bedis9@gmail.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PROTO_DGRAM_H +#define _HAPROXY_PROTO_DGRAM_H + +#include <haproxy/dgram-t.h> + +void dgram_fd_handler(int); + +#endif // _HAPROXY_PROTO_DGRAM_H diff --git a/include/haproxy/dict-t.h b/include/haproxy/dict-t.h new file mode 100644 index 0000000..deaa88d --- /dev/null +++ b/include/haproxy/dict-t.h @@ -0,0 +1,46 @@ +/* + * include/haproxy/dict-t.h + * Dictionaries - types definitions + * + * Copyright 2019 Frederic Lecaille <flecaille@haproxy.com> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _HAPROXY_DICT_T_H +#define _HAPROXY_DICT_T_H + +#include <import/ebtree-t.h> +#include <haproxy/api-t.h> +#include <haproxy/thread-t.h> + +struct dict_entry { + struct ebpt_node value; + unsigned int refcount; + size_t len; +}; + +struct dict { + const char *name; + struct eb_root values; + __decl_thread(HA_RWLOCK_T rwlock); +}; + +#endif /* _HAPROXY_DICT_T_H */ diff --git a/include/haproxy/dict.h b/include/haproxy/dict.h new file mode 100644 index 0000000..635c3f1 --- /dev/null +++ b/include/haproxy/dict.h @@ -0,0 +1,36 @@ +/* + * include/haproxy/dict.h + * Dictionaries - functions prototypes + * + * Copyright 2019 Frederic Lecaille <flecaille@haproxy.com> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _HAPROXY_DICT_H +#define _HAPROXY_DICT_H + +#include <haproxy/dict-t.h> + +struct dict *new_dict(const char *name); +struct dict_entry *dict_insert(struct dict *d, char *str); +void dict_entry_unref(struct dict *d, struct dict_entry *de); + +#endif /* _HAPROXY_DICT_H */ diff --git a/include/haproxy/dns-t.h b/include/haproxy/dns-t.h new file mode 100644 index 0000000..1c876e3 --- /dev/null +++ b/include/haproxy/dns-t.h @@ -0,0 +1,179 @@ +/* + * include/haproxy/dns-t.h + * This file provides structures and types for DNS. + * + * Copyright (C) 2014 Baptiste Assmann <bedis9@gmail.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_DNS_T_H +#define _HAPROXY_DNS_T_H + +#include <import/ebtree-t.h> + +#include <haproxy/connection-t.h> +#include <haproxy/buf-t.h> +#include <haproxy/dgram-t.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/ring-t.h> +#include <haproxy/stats-t.h> +#include <haproxy/task-t.h> +#include <haproxy/thread.h> + +/* DNS header size */ +#define DNS_HEADER_SIZE ((int)sizeof(struct dns_header)) + +/* max pending requests per stream */ +#define DNS_STREAM_MAX_PIPELINED_REQ 4 + +#define DNS_TCP_MSG_MAX_SIZE 65535 +#define DNS_TCP_MSG_RING_MAX_SIZE (1 + 1 + 3 + DNS_TCP_MSG_MAX_SIZE) // varint_bytes(DNS_TCP_MSG_MAX_SIZE) == 3 + +/* DNS request or response header structure */ +struct dns_header { + uint16_t id; + uint16_t flags; + uint16_t qdcount; + uint16_t ancount; + uint16_t nscount; + uint16_t arcount; +} __attribute__ ((packed)); + +/* short structure to describe a DNS question */ +/* NOTE: big endian structure */ +struct dns_question { + unsigned short qtype; /* question type */ + unsigned short qclass; /* query class */ +} __attribute__ ((packed)); + + +/* NOTE: big endian structure */ +struct dns_additional_record { + uint8_t name; /* domain name, must be 0 (RFC 6891) */ + uint16_t type; /* record type DNS_RTYPE_OPT (41) */ + uint16_t udp_payload_size; /* maximum size accepted for the response */ + uint32_t extension; /* extended rcode and flags, not used for now */ + uint16_t data_length; /* data length */ +/* as of today, we don't support yet edns options, that said I already put a + * placeholder here for this purpose. We may need to define a dns_option_record + * structure which itself should point to different type of data, based on the + * extension set (client subnet, tcp keepalive, etc...)*/ +} __attribute__ ((packed)); + +/* Structure describing a name server used during name resolution. + * A name server belongs to a resolvers section. 
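+ * For illustration only (syntax assumed from the resolvers documentation,
+ * not from this header): a "nameserver ns1 tcp@10.0.0.1:53" directive in a
+ * "resolvers" section would be backed by this stream server, while plain
+ * UDP nameservers rely on the dgram structure below.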
+ */
+struct dns_stream_server {
+	struct server *srv;
+	struct ring *ring_req;
+	int max_slots;
+	int maxconn;
+	int idle_conns;
+	int cur_conns;
+	int max_active_conns;
+	size_t ofs_req;            // ring buffer reader offset
+	size_t ofs_rsp;            // ring buffer reader offset
+	struct task *task_req;     /* req conn management */
+	struct task *task_rsp;     /* rsp management */
+	struct task *task_idle;    /* handle idle sess */
+	struct list free_sess;
+	struct list idle_sess;
+	struct list wait_sess;
+	__decl_thread(HA_SPINLOCK_T lock); // lock to protect current struct
+};
+
+struct dns_dgram_server {
+	struct dgram_conn conn;    /* transport layer */
+	struct ring *ring_req;
+	size_t ofs_req;            // ring buffer reader offset
+};
+
+struct dns_query {
+	struct eb32_node qid;
+	uint16_t original_qid;
+	int expire;
+	struct list list;
+};
+
+struct dns_session {
+	struct appctx *appctx;     // appctx of current session
+	struct dns_stream_server *dss;
+	uint16_t tx_msg_offset;
+	int nb_queries;
+	int onfly_queries;
+	int query_counter;
+	struct list list;
+	struct list waiter;
+	struct list queries;
+	struct task *task_exp;
+	struct eb_root query_ids;  /* tree to quickly lookup/retrieve query ids currently in use */
+	size_t ofs;                // ring buffer reader offset
+	struct ring ring;
+	struct {
+		uint16_t len;
+		uint16_t offset;
+		char *area;
+	} rx_msg;
+	unsigned char *tx_ring_area;
+	int shutdown;
+};
+
+/* Structure describing a name server
+ */
+struct dns_nameserver {
+	char *id;                  /* nameserver unique identifier */
+	void *parent;
+	struct {
+		const char *file;  /* file where the section appears */
+		int line;          /* line where the section appears */
+	} conf;                    /* config information */
+
+	int (*process_responses)(struct dns_nameserver *ns); /* callback used to process responses */
+	struct dns_dgram_server *dgram;   /* used for dgram dns */
+	struct dns_stream_server *stream; /* used for tcp dns */
+
+	EXTRA_COUNTERS(extra_counters);
+	struct dns_counters *counters;
+
+	struct list list;          /* nameserver chained list */
+};
+
+/* mixed dns and resolver counters, we will have to split them */
+struct dns_counters {
+	char *id;
+	char *pid;
+	long long sent;            /* - queries sent */
+	long long snd_error;       /* - sending errors */
+	union {
+		struct {
+			long long valid;       /* - valid response */
+			long long update;      /* - valid response used to update server's IP */
+			long long cname;       /* - CNAME response requiring new resolution */
+			long long cname_error; /* - error when resolving CNAMEs */
+			long long any_err;     /* - void response (usually because ANY qtype) */
+			long long nx;          /* - NX response */
+			long long timeout;     /* - queries which reached timeout */
+			long long refused;     /* - queries refused */
+			long long other;       /* - other type of response */
+			long long invalid;     /* - malformed DNS response */
+			long long too_big;     /* - too big response */
+			long long outdated;    /* - outdated response (server slower than the other ones) */
+			long long truncated;   /* - truncated response */
+		} resolver;
+	} app; /* application specific counters */
+};
+
+#endif /* _HAPROXY_DNS_T_H */
diff --git a/include/haproxy/dns.h b/include/haproxy/dns.h
new file mode 100644
index 0000000..84181c4
--- /dev/null
+++ b/include/haproxy/dns.h
@@ -0,0 +1,33 @@
+/*
+ * include/haproxy/dns.h
+ * This file provides functions related to DNS protocol
+ *
+ * Copyright (C) 2020 HAProxy Technologies
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software
Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_DNS_H +#define _HAPROXY_DNS_H + +#include <haproxy/dns-t.h> +#include <haproxy/server-t.h> + +int dns_send_nameserver(struct dns_nameserver *ns, void *buf, size_t len); +ssize_t dns_recv_nameserver(struct dns_nameserver *ns, void *data, size_t size); +int dns_dgram_init(struct dns_nameserver *ns, struct sockaddr_storage *sk); +int dns_stream_init(struct dns_nameserver *ns, struct server *s); + +#endif // _HAPROXY_DNS_H diff --git a/include/haproxy/dynbuf-t.h b/include/haproxy/dynbuf-t.h new file mode 100644 index 0000000..b5545ab --- /dev/null +++ b/include/haproxy/dynbuf-t.h @@ -0,0 +1,41 @@ +/* + * include/haproxy/dynbuf-t.h + * Structure definitions for dynamic buffer management. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_DYNBUF_T_H +#define _HAPROXY_DYNBUF_T_H + + +/* an element of the <buffer_wq> list. It represents an object that need to + * acquire a buffer to continue its process. */ +struct buffer_wait { + void *target; /* The waiting object that should be woken up */ + int (*wakeup_cb)(void *); /* The function used to wake up the <target>, passed as argument */ + struct list list; /* Next element in the <buffer_wq> list */ +}; + +#endif /* _HAPROXY_DYNBUF_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/dynbuf.h b/include/haproxy/dynbuf.h new file mode 100644 index 0000000..a89800c --- /dev/null +++ b/include/haproxy/dynbuf.h @@ -0,0 +1,131 @@ +/* + * include/haproxy/dynbuf.h + * Buffer management functions. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_DYNBUF_H
+#define _HAPROXY_DYNBUF_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <import/ist.h>
+#include <haproxy/activity.h>
+#include <haproxy/api.h>
+#include <haproxy/buf.h>
+#include <haproxy/chunk.h>
+#include <haproxy/dynbuf-t.h>
+#include <haproxy/pool.h>
+
+extern struct pool_head *pool_head_buffer;
+
+int init_buffer(void);
+void buffer_dump(FILE *o, struct buffer *b, int from, int to);
+
+/*****************************************************************/
+/* These functions are used to compute various buffer area sizes */
+/*****************************************************************/
+
+/* Return 1 if the buffer has less than 1/4 of its capacity free, otherwise 0 */
+static inline int buffer_almost_full(const struct buffer *buf)
+{
+	if (b_is_null(buf))
+		return 0;
+
+	return b_almost_full(buf);
+}
+
+/**************************************************/
+/* Functions below are used for buffer allocation */
+/**************************************************/
+
+/* Ensures that <buf> is allocated, or allocates it. If no memory is available,
+ * ((char *)1) is assigned instead with a zero size. The allocated buffer is
+ * returned, or NULL in case no memory is available. Since buffers only contain
+ * user data, poisoning is always disabled as it brings no benefit and impacts
+ * performance. Due to the difficult buffer_wait management, they are not
+ * subject to forced allocation failures either.
+ */
+#define b_alloc(_buf) \
+({ \
+	char *_area; \
+	struct buffer *_retbuf = _buf; \
+	 \
+	if (!_retbuf->size) { \
+		*_retbuf = BUF_WANTED; \
+		_area = pool_alloc_flag(pool_head_buffer, POOL_F_NO_POISON | POOL_F_NO_FAIL); \
+		if (unlikely(!_area)) { \
+			activity[tid].buf_wait++; \
+			_retbuf = NULL; \
+		} \
+		else { \
+			_retbuf->area = _area; \
+			_retbuf->size = pool_head_buffer->size; \
+		} \
+	} \
+	_retbuf; \
+ })
+
+/* Releases buffer <buf> (no check of emptiness). The buffer's head is marked
+ * empty.
+ */
+#define __b_free(_buf) \
+	do { \
+		char *area = (_buf)->area; \
+		 \
+		/* let's first clear the area to save an occasional "show sess all" \
+		 * glancing over our shoulder from getting a dangling pointer. \
+		 */ \
+		*(_buf) = BUF_NULL; \
+		__ha_barrier_store(); \
+		pool_free(pool_head_buffer, area); \
+	} while (0) \
+
+/* Releases buffer <buf> if allocated, and marks it empty. */
+#define b_free(_buf) \
+	do { \
+		if ((_buf)->size) \
+			__b_free((_buf)); \
+	} while (0)
+
+/* Offer one or multiple buffers currently belonging to target <from> to
+ * whoever needs one. Any pointer is valid for <from>, including NULL. Its
+ * purpose is to avoid passing a buffer to oneself in case of failed
+ * allocations (e.g. need two buffers, get one, fail, release it and wake up
+ * self again). In case of normal buffer release where it is expected that the
+ * caller is not waiting for a buffer, NULL is fine. It will wake waiters on
+ * the current thread only.
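+ *
+ * Illustrative allocation/release cycle using the helpers above (a sketch,
+ * not lifted from a real call site):
+ *
+ *   struct buffer buf = BUF_NULL;
+ *
+ *   if (b_alloc(&buf)) {
+ *       ... fill and consume buf ...
+ *       b_free(&buf);
+ *       offer_buffers(NULL, 1);
+ *   }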
+ */ +void __offer_buffers(void *from, unsigned int count); + +static inline void offer_buffers(void *from, unsigned int count) +{ + if (!LIST_ISEMPTY(&th_ctx->buffer_wq)) + __offer_buffers(from, count); +} + + +#endif /* _HAPROXY_DYNBUF_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/errors.h b/include/haproxy/errors.h new file mode 100644 index 0000000..c102fed --- /dev/null +++ b/include/haproxy/errors.h @@ -0,0 +1,139 @@ +/* + * include/haproxy/errors.h + * Global error macros and constants + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_ERRORS_H +#define _HAPROXY_ERRORS_H + +#include <stdarg.h> +#include <stdio.h> + +#include <haproxy/buf-t.h> +#include <haproxy/obj_type-t.h> + +/* These flags may be used in various functions which are called from within + * loops (eg: to start all listeners from all proxies). They provide enough + * information to let the caller decide what to do. ERR_WARN and ERR_ALERT + * do not indicate any error, just that a message has been put in a shared + * buffer in order to be displayed by the caller. + */ +#define ERR_NONE 0x00 /* no error, no message returned */ +#define ERR_RETRYABLE 0x01 /* retryable error, may be cumulated */ +#define ERR_FATAL 0x02 /* fatal error, may be cumulated */ +#define ERR_ABORT 0x04 /* it's preferable to end any possible loop */ +#define ERR_WARN 0x08 /* a warning message has been returned */ +#define ERR_ALERT 0x10 /* an alert message has been returned */ + +#define ERR_CODE (ERR_RETRYABLE|ERR_FATAL|ERR_ABORT) /* mask */ + +extern struct ring *startup_logs; + +/* These codes may be used by config parsing functions which detect errors and + * which need to inform the upper layer about them. They are all prefixed with + * "PE_" for "Parse Error". These codes will probably be extended, and functions + * making use of them should be documented as such. Only code PE_NONE (zero) may + * indicate a valid condition, all other ones must be caught as errors, event if + * unknown by the caller. This must not be used to forward warnings. 
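+ *
+ * Minimal caller-side sketch (parse_foo() is a hypothetical parser):
+ *
+ *   if (parse_foo(args, &out) != PE_NONE)
+ *       return 0; // reject the directive: any non-zero code is an error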
+ */ +enum { + PE_NONE = 0, /* no error */ + PE_ENUM_OOR, /* enum data out of allowed range */ + PE_EXIST, /* trying to create something which already exists */ + PE_ARG_MISSING, /* mandatory argument not provided */ + PE_ARG_NOT_USED, /* argument provided cannot be used */ + PE_ARG_INVC, /* invalid char in argument (pointer not provided) */ + PE_ARG_INVC_PTR, /* invalid char in argument (pointer provided) */ + PE_ARG_NOT_FOUND, /* argument references something not found */ + PE_ARG_VALUE_OOR, /* argument value is out of range */ +}; + + +void usermsgs_clr(const char *prefix); +int usermsgs_empty(void); +const char *usermsgs_str(void); +extern uint tot_warnings; + +/************ Error reporting functions ***********/ + +struct usermsgs_ctx { + struct buffer str; + + const char *prefix; /* prefix of every output */ + const char *file; /* related filename for config parsing */ + int line; /* related line number for config parsing */ + enum obj_type *obj; /* related proxy, server, ... */ +}; +void set_usermsgs_ctx(const char *file, int line, enum obj_type *obj); +void register_parsing_obj(enum obj_type *obj); +void reset_usermsgs_ctx(void); + +/* + * Displays the message on stderr with the date and pid. Overrides the quiet + * mode during startup. + */ +void ha_alert(const char *fmt, ...) + __attribute__ ((format(printf, 1, 2))); + +/* + * Displays the message on stderr with the date and pid. + */ +void ha_warning(const char *fmt, ...) + __attribute__ ((format(printf, 1, 2))); + +/* + * These functions are reserved to output diagnostics on MODE_DIAG. + * Use the underscore variants only if MODE_DIAG has already been checked. + */ +void _ha_vdiag_warning(const char *fmt, va_list argp); +void _ha_diag_warning(const char *fmt, ...); +void ha_diag_warning(const char *fmt, ...) + __attribute__ ((format(printf, 1 ,2))); + +/* Check for both MODE_DIAG and <cond> before outputting a diagnostic warning */ +#define HA_DIAG_WARNING_COND(cond, fmt, ...) \ + do { \ + if ((global.mode & MODE_DIAG) && (cond)) \ + _ha_diag_warning((fmt), ##__VA_ARGS__); \ + } while (0) + +/* + * Displays the message on stderr with the date and pid. + */ +void ha_notice(const char *fmt, ...) + __attribute__ ((format(printf, 1, 2))); + +/* + * Displays the message on <out> only if quiet mode is not set. + */ +void qfprintf(FILE *out, const char *fmt, ...) + __attribute__ ((format(printf, 2, 3))); + +void startup_logs_init(); +struct ring *startup_logs_dup(struct ring *src); +void startup_logs_free(struct ring *r); + +#endif /* _HAPROXY_ERRORS_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/event_hdl-t.h b/include/haproxy/event_hdl-t.h new file mode 100644 index 0000000..d499852 --- /dev/null +++ b/include/haproxy/event_hdl-t.h @@ -0,0 +1,295 @@ +/* + * include/haproxy/event_hdl-t.h + * event handlers management definitions + * + * Copyright 2022 HAProxy Technologies + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_EVENT_HDL_T_H
+# define _HAPROXY_EVENT_HDL_T_H
+
+#include <stdint.h>
+#include <sys/time.h>
+
+#include <haproxy/api-t.h>
+
+/* event data structs are defined as follows */
+struct event_hdl_cb_data_template {
+        struct {
+                /* safe data can be safely used from both
+                 * sync and async handlers
+                 * data consistency is guaranteed
+                 */
+        } safe;
+        struct {
+                /* unsafe data may only be used from sync handlers:
+                 * in async mode, data consistency cannot be guaranteed
+                 * and unsafe data may already be stale, thus using
+                 * it is highly discouraged because it
+                 * could lead to undefined behavior (UAF, null dereference...)
+                 */
+        } unsafe;
+};
+
+/* event_hdl tunables */
+struct event_hdl_tune {
+        unsigned int max_events_at_once;
+};
+
+/* FIXME: adjust if needed! Should be large enough
+ * to support every struct event_hdl_cb_data_x types
+ * BUG_ON check in publish/async_mode and static assert
+ * in EVENT_HDL_CB_DATA will ensure this
+ */
+#define EVENT_HDL_ASYNC_EVENT_DATA (768)
+/* used internally to store a single copy of event data when dealing with
+ * async handlers.
+ * The same copy can be provided to multiple handlers to prevent memory waste:
+ * refcount is used to keep track of references so that
+ * data can be freed when not used anymore
+ */
+typedef void (*event_hdl_data_free)(const void *data);
+struct event_hdl_async_event_data
+{
+        /* internal storage */
+        char data[EVENT_HDL_ASYNC_EVENT_DATA];
+        /* user-provided free function if event data relies on
+         * dynamic members that require specific cleanup
+         */
+        event_hdl_data_free mfree;
+        uint32_t refcount;
+};
+
+/* type for storing event subscription type */
+struct event_hdl_sub_type
+{
+        /* up to 256 families, non cumulative, adjust if needed */
+        uint8_t family;
+        /* up to 16 sub types using bitmasks, adjust if needed */
+        uint16_t subtype;
+};
+
+struct event_hdl_sub_list_head {
+        struct mt_list head;
+        struct mt_list known; /* api uses this to track known subscription lists */
+};
+
+/* event_hdl_sub_list is an alias (please use this for portability) */
+typedef struct event_hdl_sub_list_head event_hdl_sub_list;
+
+struct event_hdl_async_equeue_head {
+        struct mt_list head;
+        uint32_t size; /* near realtime size, not fully synced with head (to be used as a hint) */
+};
+
+/* event_hdl_async_equeue is an alias (please use this for portability) */
+typedef struct event_hdl_async_equeue_head event_hdl_async_equeue;
+
+/* subscription mgmt from event */
+struct event_hdl_sub_mgmt
+{
+        /* manage subscriptions from event
+         * this must not be used directly because locking might be required
+         */
+        struct event_hdl_sub *this;
+        /* safe functions that can be used from event context (sync and async mode) */
+        struct event_hdl_sub_type (*getsub)(const struct event_hdl_sub_mgmt *);
+        int (*resub)(const struct event_hdl_sub_mgmt *, struct event_hdl_sub_type);
+        void (*unsub)(const struct event_hdl_sub_mgmt *);
+};
+
+/* single event structure pushed into async event queue
+ * used by tasks async handlers
+ */
+struct event_hdl_async_event
+{
+        struct mt_list mt_list;
+        struct event_hdl_sub_type type;
+        /* data wrapper - should not be used directly */
+        struct event_hdl_async_event_data *_data;
+        /* for easy data access,
+         * points to _data->data if data is available
+         */
+        void *data;
+        void *private;
+        struct timeval when;
+        struct event_hdl_sub_mgmt sub_mgmt;
+};
+
+/* internal structure provided to function event_hdl_publish()
+ * It contains ptr to data relevant to the event
+ */
+struct event_hdl_cb_data {
+        /* internal use: ptr to struct event_hdl_cb_data_type */
+        void *_ptr;
+        /* internal use: holds actual data size */
+        size_t _size;
+        /* user specified freeing function for event_hdl_cb_data_type
+         * struct members
+         */
+        event_hdl_data_free _mfree;
+};
+
+/* struct provided to event_hdl_cb_* handlers
+ * contains data related to the event
+ * that triggered the handler
+ */
+struct event_hdl_cb
+{
+        /* event type */
+        struct event_hdl_sub_type e_type;
+        /* event data */
+        void *e_data;
+        /* manage the subscription responsible for handing the event to us */
+        const struct event_hdl_sub_mgmt *sub_mgmt;
+
+        /* may be used by sync event handler to ensure
+         * it runs in sync mode, and thus is eligible to access unsafe data.
+         * This could save the day when users are copy-pasting function
+         * logic from a sync handler to an async handler without
+         * taking appropriate precautions and unsafe accesses are performed.
+         * (See EVENT_HDL_ASSERT_SYNC macro API helper)
+         */
+        uint8_t _sync;
+};
+
+/* prototype for event_hdl_cb_sync function pointer */
+typedef void (*event_hdl_cb_sync)(const struct event_hdl_cb *cb, void *private);
+/* prototype for event_hdl_cb_async function pointer */
+typedef void (*event_hdl_cb_async)(const struct event_hdl_cb *cb, void *private);
+/* prototype for event_hdl_private_free function pointer */
+typedef void (*event_hdl_private_free)(void *private);
+
+/* tasklet forward declaration */
+struct tasklet;
+/* enum for async handler modes */
+enum event_hdl_async_mode
+{
+        EVENT_HDL_ASYNC_MODE_NORMAL = 1,
+        EVENT_HDL_ASYNC_MODE_ADVANCED = 2
+};
+
+/* event hdl, used when subscribing (and then associated with a subscription) */
+struct event_hdl {
+        /* optional unique id (hash) for lookup */
+        uint64_t id;
+        /* handler debug: origin (initial event subscription calling place) */
+        const char *dorigin;
+        /* handler requires async mode:
+         * EVENT_HDL_ASYNC_MODE_NORMAL = normal
+         * EVENT_HDL_ASYNC_MODE_ADVANCED = advanced, single task wakeup
+         */
+        uint8_t async;
+
+        union {
+                event_hdl_cb_sync sync_ptr;   /* if !async */
+                event_hdl_cb_async async_ptr; /* only used if async==1 (normal) */
+        };
+
+        /* ptr to async task responsible for consuming events */
+        struct tasklet *async_task;
+        /* used by async tasks to consume pending events */
+        event_hdl_async_equeue *async_equeue;
+        /* function ptr automatically called by:
+         *  async task when hdl is unregistered and private is no longer referenced
+         *  sync context when unregistering is performed
+         */
+        event_hdl_private_free private_free;
+        /* it is not safe to assume that private will not
+         * be used anymore once hdl is unregistered:
+         * with async handlers, private could still be referenced
+         * in pending events to be consumed later by the task (by design).
+         * If freeing private is needed, you must provide the private_free
+         * function pointer when registering.
+         * It will be called when private is no longer used
+         * after unregistering hdl to perform private cleanup.
+ * (please use this even in sync mode so that subscription + * can easily be turned into async mode later without breaking stuff) + */ + void *private; +}; + +/* flags for event_hdl_sub struct (32 bits) */ +#define EHDL_SUB_F_PAUSED 0x0001 /* subscription will temporarily ignore events */ + +/* list elem: subscription (handler subscribed to specific events) + */ +struct event_hdl_sub { + struct mt_list mt_list; + /* event type subscription */ + struct event_hdl_sub_type sub; + uint32_t flags; + /* event handler */ + struct event_hdl hdl; + /* used to guarantee that END event will be delivered + * (memory is allocated when registering, no memory failure can occur at runtime) + */ + struct event_hdl_async_event *async_end; + /* > 0 : subscription is referenced, don't free yet + * use atomic OPS to write and read from it + */ + uint32_t refcount; + /* TODO: atomic_call_counter for stats?! */ +}; + +#define ESUB_INDEX(n) (1 << (n - 1)) + +#define EVENT_HDL_SUB_TYPE(_family, _type) ((struct event_hdl_sub_type){ .family = _family, .subtype = ESUB_INDEX(_type) }) +#define EVENT_HDL_SUB_FAMILY(_family) ((struct event_hdl_sub_type){ .family = _family, .subtype = ~0 }) + +#define EVENT_HDL_SUB_NONE ((struct event_hdl_sub_type){ .family = 0, .subtype = 0}) +/* for async tasks: subscription is ending */ +#define EVENT_HDL_SUB_END ((struct event_hdl_sub_type){ .family = 0, .subtype = 1}) + +/* --------------------------------------- */ + +/* user defined event types are listed here + * please reflect any change in these macros in the subtype map + * defined below that is used to perform string to event type and + * event type to string conversions + */ + +/* TODO */ + +/* SERVER FAMILY, provides event_hdl_cb_data_server struct + * (will be defined in haproxy/server-t.h) + */ +#define EVENT_HDL_SUB_SERVER EVENT_HDL_SUB_FAMILY(1) +#define EVENT_HDL_SUB_SERVER_ADD EVENT_HDL_SUB_TYPE(1,1) +#define EVENT_HDL_SUB_SERVER_DEL EVENT_HDL_SUB_TYPE(1,2) +#define EVENT_HDL_SUB_SERVER_UP EVENT_HDL_SUB_TYPE(1,3) +#define EVENT_HDL_SUB_SERVER_DOWN EVENT_HDL_SUB_TYPE(1,4) +/* server state change */ +#define EVENT_HDL_SUB_SERVER_STATE EVENT_HDL_SUB_TYPE(1,5) +/* server admin change */ +#define EVENT_HDL_SUB_SERVER_ADMIN EVENT_HDL_SUB_TYPE(1,6) +/* server check-related (agent or health) event */ +#define EVENT_HDL_SUB_SERVER_CHECK EVENT_HDL_SUB_TYPE(1,7) +/* server inet addr (addr:svc_port tuple) change event */ +#define EVENT_HDL_SUB_SERVER_INETADDR EVENT_HDL_SUB_TYPE(1,8) + +/* --------------------------------------- */ + +/* Please reflect changes above in event_hdl_sub_type_map defined + * in event_hdl.c file + */ +struct event_hdl_sub_type_map { + const char *name; + struct event_hdl_sub_type type; +}; + +#endif /* _HAPROXY_EVENT_HDL_T_H */ diff --git a/include/haproxy/event_hdl.h b/include/haproxy/event_hdl.h new file mode 100644 index 0000000..5a7ee66 --- /dev/null +++ b/include/haproxy/event_hdl.h @@ -0,0 +1,512 @@ +/* + * include/haproxy/event_hdl.h + * event handlers management + * + * Copyright 2022 HAProxy Technologies + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_EVENT_HDL_H
+# define _HAPROXY_EVENT_HDL_H
+
+#include <haproxy/event_hdl-t.h>
+#include <haproxy/list.h>
+
+/* preprocessor trick to extract function calling place
+ * __FILE__:__LINE__
+ */
+#define _EVENT_HDL_CALLING_PLACE2(line) #line
+#define _EVENT_HDL_CALLING_PLACE1(line) _EVENT_HDL_CALLING_PLACE2(line)
+#define _EVENT_HDL_CALLING_PLACE __FILE__":"_EVENT_HDL_CALLING_PLACE1(__LINE__)
+
+/* ------ PUBLIC EVENT_HDL API ------ */
+
+/* You will find a lot of useful information/comments in this file, but if you're looking
+ * for a step by step documentation please check out 'doc/internals/api/event_hdl.txt'
+ */
+
+/* Note: API helper macros are used in this file to make event_hdl functions usage
+ * simpler, safer and more consistent between sync mode and async mode
+ */
+
+/* ======================================= EVENT_HDL_SYNC handlers =====================================
+ * must be used only with extreme precautions
+ * sync handlers are directly called under the function that published the event.
+ * Hence, all the processing done within such a function will impact the caller.
+ *
+ * For this reason, you must be extremely careful when using sync mode: trying to lock something
+ * that is already held by the caller, or depending on something external to the current thread,
+ * will prevent the caller from running.
+ *
+ * Please consider using async handlers in this case, they are specifically made to solve this limitation.
+ *
+ * On the other hand, sync handlers are really useful when you directly depend on callers' provided data
+ * (example: pointer to data) or you need to perform something before the caller keeps going.
+ * A good example could be a cleanup function that will take care of freeing data, closing fds...
+ * related to event data before the caller's flow keeps going (interrupting the process while
+ * dealing with the event).
+ */
+
+
+/* ===================================== EVENT_HDL_ASYNC handlers ======================================
+ * async handlers are run in independent tasks, so that the caller (that published the event) can safely
+ * return to its own processing.
+ *
+ * async handlers may access safe event data safely with guaranteed consistency.
+ */
+
+
+/* ================================ IDENTIFIED vs ANONYMOUS EVENT_HDL =================================
+ * When registering a sync or async event handler, you are free to provide a unique identifier (hash).
+ *
+ * id can be computed using the event_hdl_id function.
+ *
+ * Not providing an id results in the subscription being considered as an anonymous subscription.
+ * 0 is not a valid identifier (identifiers must be > 0).
+ *
+ * An identified subscription is guaranteed to be unique for a given subscription list,
+ * whereas anonymous subscriptions don't provide such guarantees.
+ *
+ * Identified subscriptions provide the ability to be later queried or unregistered from external code
+ * using the dedicated id/hash for the lookups.
+ *
+ * Anonymous subscriptions don't: the only way to reference an anonymous subscription
+ * is to use a subscription pointer.
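+ *
+ * Illustrative sketch (editor's addition, not upstream documentation):
+ * an identified sync subscription could be registered as below, where
+ * my_sync_hdl is a hypothetical 'event_hdl_cb_sync' prototyped function:
+ *
+ *   uint64_t id = event_hdl_id("myscope", "myname");
+ *
+ *   event_hdl_subscribe(NULL, EVENT_HDL_SUB_SERVER_ADD,
+ *                       EVENT_HDL_ID_SYNC(id, my_sync_hdl, NULL, NULL));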
+ *
+ */
+
+/* general purpose hashing function when you want to compute
+ * an ID based on <scope> x <name>
+ * It is your responsibility to make sure <scope> is not used
+ * elsewhere in the code (or that you are fine with sharing
+ * the scope).
+ */
+uint64_t event_hdl_id(const char *scope, const char *name);
+
+/* ------ EVENT SUBSCRIPTIONS FUNCTIONS ------ */
+
+/* macro helper:
+ * sync version
+ *
+ * identified subscription
+ *
+ * <_id>: subscription id that could be used later
+ *        to perform subscription lookup by id
+ * <func>: pointer to 'event_hdl_cb_sync' prototyped function
+ * <_private>: pointer to private data that will be handed to <func>
+ * <_private_free>: pointer to 'event_hdl_private_free' prototyped function
+ *                  that will be called with <private> when unsubscription is performed
+ */
+#define EVENT_HDL_ID_SYNC(_id, func, _private, _private_free)          \
+        (struct event_hdl){ .id = _id,                                 \
+                            .dorigin = _EVENT_HDL_CALLING_PLACE,       \
+                            .async = 0,                                \
+                            .sync_ptr = func,                          \
+                            .private = _private,                       \
+                            .private_free = _private_free }
+
+/* macro helper:
+ * sync version
+ *
+ * anonymous subscription (no lookup by id)
+ *
+ * <func>: pointer to 'event_hdl_cb_sync' prototyped function
+ * <_private>: pointer to private data that will be handed to <func>
+ * <_private_free>: pointer to 'event_hdl_private_free' prototyped function
+ *                  that will be called with <private> when unsubscription is performed
+ */
+#define EVENT_HDL_SYNC(func, _private, _private_free)                  \
+        EVENT_HDL_ID_SYNC(0, func, _private, _private_free)
+
+/* macro helper:
+ * async version
+ *
+ * identified subscription
+ *
+ * <_id>: subscription id that could be used later
+ *        to perform subscription lookup by id
+ * <func>: pointer to 'event_hdl_cb_async' prototyped function
+ * <_private>: pointer to private data that will be handed to <func>
+ * <_private_free>: pointer to 'event_hdl_private_free' prototyped function
+ *                  that will be called with <private> after unsubscription is performed,
+ *                  when no more events can refer to <private>.
+ */
+#define EVENT_HDL_ID_ASYNC(_id, func, _private, _private_free)         \
+        (struct event_hdl){ .id = _id,                                 \
+                            .dorigin = _EVENT_HDL_CALLING_PLACE,       \
+                            .async = EVENT_HDL_ASYNC_MODE_NORMAL,      \
+                            .async_ptr = func,                         \
+                            .private = _private,                       \
+                            .private_free = _private_free }
+
+/* macro helper:
+ * async version
+ *
+ * anonymous subscription (no lookup by id)
+ *
+ * <func>: pointer to 'event_hdl_cb_async' prototyped function
+ * <_private>: pointer to private data that will be handed to <func>
+ * <_private_free>: pointer to 'event_hdl_private_free' prototyped function
+ *                  that will be called with <private> after unsubscription is performed,
+ *                  when no more events can refer to <private>.
+ */
+#define EVENT_HDL_ASYNC(func, _private, _private_free)                 \
+        EVENT_HDL_ID_ASYNC(0, func, _private, _private_free)
+
+/* macro helper:
+ * async version
+ * same as EVENT_HDL_ID_ASYNC - advanced mode:
+ * you directly provide task and event_queue list.
+ *
+ * identified subscription
+ *
+ * <_id>: subscription id that could be used later
+ *        to perform subscription lookup by id
+ * <equeue>: pointer to event_hdl_async_event queue where the pending
+ *           events will be pushed. Cannot be NULL.
+ * <task>: pointer to task(let) responsible for consuming the events.
+ *         Cannot be NULL.
+ * <_private>: pointer to private data that will be handed to <func>
+ * <_private_free>: pointer to 'event_hdl_private_free' prototyped function
+ *                  that will be called with <private> after unsubscription is performed,
+ *                  when no more events can refer to <private>.
+ */
+#define EVENT_HDL_ID_ASYNC_TASK(_id, equeue, task, _private, _private_free)    \
+        (struct event_hdl){ .id = _id,                                         \
+                            .dorigin = _EVENT_HDL_CALLING_PLACE,               \
+                            .async = EVENT_HDL_ASYNC_MODE_ADVANCED,            \
+                            .async_task = (struct tasklet *)task,              \
+                            .async_equeue = equeue,                            \
+                            .private = _private,                               \
+                            .private_free = _private_free }
+
+/* macro helper:
+ * async version
+ * same as EVENT_HDL_ASYNC - advanced mode:
+ * you directly provide task and event_queue list.
+ *
+ * anonymous subscription (no lookup by id)
+ *
+ * <equeue>: pointer to event_hdl_async_event queue where the pending
+ *           events will be pushed. Cannot be NULL.
+ * <task>: pointer to task(let) responsible for consuming the events.
+ *         Cannot be NULL.
+ * <_private>: pointer to private data that will be handed to <func>
+ * <_private_free>: pointer to 'event_hdl_private_free' prototyped function
+ *                  that will be called with <private> after unsubscription is performed,
+ *                  when no more events can refer to <private>.
+ */
+#define EVENT_HDL_ASYNC_TASK(equeue, task, _private, _private_free)    \
+        EVENT_HDL_ID_ASYNC_TASK(0, equeue, task, _private, _private_free)
+
+/* register a new event subscription in <sub_list>
+ * that will handle <e_type> events
+ *
+ * This function requires you to use
+ * EVENT_HDL_(TASK_)(A)SYNC() EVENT_HDL_ID_(TASK_)(A)SYNC() (choose wisely)
+ * macro helpers to provide the <hdl> argument
+ *
+ * If <sub_list> is not specified (equals NULL):
+ * the global subscription list (process wide) will be used.
+ *
+ * For identified subscriptions (EVENT_HDL_ID_*), the function is safe against
+ * concurrent subscription attempts with the same ID: the ID will only be
+ * inserted once in the list and subsequent attempts will yield an error.
+ * However, trying to register the same ID multiple times is considered an
+ * error (no specific error code is returned in this case) so the check should
+ * be performed by the caller if it is expected. (The caller must ensure that
+ * the ID is unique to prevent the error from being raised)
+ *
+ * Returns 1 in case of success, 0 in case of failure (invalid argument / memory error)
+ */
+int event_hdl_subscribe(event_hdl_sub_list *sub_list,
+                        struct event_hdl_sub_type e_type, struct event_hdl hdl);
+
+/* same as event_hdl_subscribe, but
+ * returns the subscription ptr in case of success
+ * or NULL in case of failure
+ * subscription refcount is automatically incremented by 1
+ * so that ptr remains valid while you use it.
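+ *
+ * For example (editor's sketch, my_async_hdl being a hypothetical
+ * 'event_hdl_cb_async' prototyped function):
+ *
+ *   struct event_hdl_sub *sub;
+ *
+ *   sub = event_hdl_subscribe_ptr(NULL, EVENT_HDL_SUB_SERVER_DOWN,
+ *                                 EVENT_HDL_ASYNC(my_async_hdl, NULL, NULL));
+ *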
+ * You must call event_hdl_drop() when you no longer
+ * use it or event_hdl_unsubscribe() to unregister the
+ * subscription
+ */
+struct event_hdl_sub *event_hdl_subscribe_ptr(event_hdl_sub_list *sub_list,
+                                              struct event_hdl_sub_type e_type, struct event_hdl hdl);
+
+/* update subscription type:
+ * if new type family does not match current family, does nothing
+ * only subtype update is supported
+ * Returns 1 for SUCCESS and 0 for FAILURE (update not supported)
+ */
+int event_hdl_resubscribe(struct event_hdl_sub *cur_sub, struct event_hdl_sub_type type);
+
+/* unregister an existing subscription <sub>
+ * will automatically call event_hdl_drop()
+ */
+void event_hdl_unsubscribe(struct event_hdl_sub *sub);
+
+/* decrease subscription refcount by 1
+ * use this when you no longer use sub ptr
+ * provided by event_hdl_subscribe_ptr or
+ * to cancel previous event_hdl_take()
+ */
+void event_hdl_drop(struct event_hdl_sub *sub);
+
+/* increase subscription refcount by 1
+ * event_hdl_drop is needed when the ptr
+ * is no longer used
+ * or event_hdl_unsubscribe to end the subscription
+ */
+void event_hdl_take(struct event_hdl_sub *sub);
+
+/* ------ EVENT_HDL_LOOKUP: subscription lookup operations from external code ------ */
+
+/* use this function to unregister the subscription <lookup_id>
+ * within <sub_list> list.
+ * If <sub_list> is NULL, global subscription list will be used.
+ * Returns 1 for SUCCESS and 0 if not found
+ */
+int event_hdl_lookup_unsubscribe(event_hdl_sub_list *sub_list,
+                                 uint64_t lookup_id);
+
+/* use this function to update subscription by <lookup_id> within <sub_list> list
+ * if new type family does not match current family, does nothing
+ * only subtype update is supported
+ * If <sub_list> is NULL, global subscription list will be used.
+ * Returns 1 for SUCCESS and 0 if not found or not supported
+ */
+int event_hdl_lookup_resubscribe(event_hdl_sub_list *sub_list,
+                                 uint64_t lookup_id, struct event_hdl_sub_type type);
+
+/* use this function to get a new reference ptr to the subscription
+ * identified by <lookup_id> (or event_hdl_unsubscribe to end the subscription)
+ * If <sub_list> is NULL, global subscription list will be used.
+ * returns NULL if not found
+ * the returned ptr should be released with event_hdl_drop when no longer used
+ */
+struct event_hdl_sub *event_hdl_lookup_take(event_hdl_sub_list *sub_list,
+                                            uint64_t lookup_id);
+
+/* pause an existing subscription <sub>:
+ * the subscription will no longer receive events (reversible)
+ * This can be reverted thanks to the _resume() function
+ */
+void event_hdl_pause(struct event_hdl_sub *sub);
+
+/* resume an existing subscription <sub>
+ * that was previously paused using the _pause() function
+ */
+void event_hdl_resume(struct event_hdl_sub *sub);
+
+/* Same as event_hdl_pause() for identified subscriptions:
+ * use this function to pause the subscription <lookup_id>
+ * within <sub_list> list.
+ * If <sub_list> is NULL, global subscription list will be used.
+ * Returns 1 for SUCCESS and 0 if not found
+ */
+int event_hdl_lookup_pause(event_hdl_sub_list *sub_list,
+                           uint64_t lookup_id);
+
+/* Same as event_hdl_resume() for identified subscriptions:
+ * use this function to resume the subscription <lookup_id>
+ * within <sub_list> list.
+ * If <sub_list> is NULL, global subscription list will be used.
+ * Returns 1 for SUCCESS and 0 if not found + */ +int event_hdl_lookup_resume(event_hdl_sub_list *sub_list, + uint64_t lookup_id); + +/* ------ PUBLISHING FUNCTIONS ------ */ + +/* this macro is provided as an internal helper to automatically populate + * data for fixed length structs as required by event_hdl publish function + */ +#define _EVENT_HDL_CB_DATA_ASSERT(size) \ + ({ \ + /* if this fails to compile \ + * it means you need to fix \ + * EVENT_HDL_ASYNC_EVENT_DATA \ + * size in event_hdl-t.h \ + */ \ + __attribute__((unused)) \ + char __static_assert[(size <= EVENT_HDL_ASYNC_EVENT_DATA) ? 1 : -1];\ + (size); \ + }) +#define _EVENT_HDL_CB_DATA(data,size,mfree) \ + (&(struct event_hdl_cb_data){ ._ptr = data, \ + ._size = size, \ + ._mfree = mfree }) + +/* Use this when 'safe' data is completely standalone */ +#define EVENT_HDL_CB_DATA(data) \ + _EVENT_HDL_CB_DATA(data, \ + _EVENT_HDL_CB_DATA_ASSERT(sizeof(*data)), \ + NULL) +/* Use this when 'safe' data points to dynamically allocated members + * that require freeing when the event is completely consumed + * (data in itself may be statically allocated as with + * EVENT_HDL_CB_DATA since the publish function will take + * care of copying it for async handlers) + * + * mfree function will be called with data as argument + * (or copy of data in async context) when the event is completely + * consumed (sync and async handlers included). This will give you + * enough context to perform the required cleanup steps. + * + * mfree should be prototyped like this: + * void (*mfree)(const void *data) + */ +#define EVENT_HDL_CB_DATA_DM(data, mfree) \ + _EVENT_HDL_CB_DATA(data, \ + _EVENT_HDL_CB_DATA_ASSERT(sizeof(*data)), \ + mfree) + +/* event publishing function + * this function should be called from anywhere in the code to notify + * about an <e_type> and provide some relevant <data> + * that will be provided to subscriptions in <sub_list> + * that are subscribed to <e_type>. + * <data> should be provided using EVENT_HDL_CB_DATA helper macro + * + * Example: + * struct event_hdl_cb_data_server cb_data; + * + * /... + * cb_data initialization + * .../ + * + * event_hdl_publish(NULL, EVENT_HDL_SUB_SERVER_UP, EVENT_HDL_CB_DATA(&cb_data)); + */ +int event_hdl_publish(event_hdl_sub_list *sub_list, + struct event_hdl_sub_type e_type, const struct event_hdl_cb_data *data); + +/* ------ MISC/HELPER FUNCTIONS ------ */ + +/* returns a statically allocated string that is + * the printable representation of <sub_type> + * or "N/A" if <sub_type> does not exist + */ +const char *event_hdl_sub_type_to_string(struct event_hdl_sub_type sub_type); + +/* returns the internal sub_type corresponding + * to the printable representation <name> + * or EVENT_HDL_SUB_NONE if no such event exists + * (see event_hdl-t.h for the complete list of supported types) + */ +struct event_hdl_sub_type event_hdl_string_to_sub_type(const char *name); + +/* Use this from sync hdl to ensure the function is executed + * in sync mode (and thus unsafe data is safe to use from this ctx) + * This macro is meant to prevent unsafe data access + * if code from sync function is copy pasted into + * async function (or if sync handler is changed + * to async handler without adapting the code) + * FIXME: do we BUG_ON, or simply warn and return from the function? 
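+ *
+ * Illustrative usage (editor's sketch, the handler name is hypothetical):
+ *
+ *   void my_sync_hdl(const struct event_hdl_cb *cb, void *private)
+ *   {
+ *       EVENT_HDL_ASSERT_SYNC(cb);
+ *       ... unsafe event data may safely be accessed from here ...
+ *   }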
+ */
+#define EVENT_HDL_ASSERT_SYNC(cb) BUG_ON(!cb->_sync)
+
+/* check if a and b sub types are part of the same family */
+static inline int event_hdl_sub_family_equal(struct event_hdl_sub_type a, struct event_hdl_sub_type b)
+{
+        return (a.family == b.family);
+}
+
+/* compares 2 event_hdl_sub_type structs
+ * returns 1 if equal, 0 if not equal
+ */
+static inline int event_hdl_sub_type_equal(struct event_hdl_sub_type a, struct event_hdl_sub_type b)
+{
+        return (a.family == b.family && a.subtype == b.subtype);
+}
+
+/* performs subtraction between A and B event_hdl_sub_type
+ */
+static inline struct event_hdl_sub_type event_hdl_sub_type_del(struct event_hdl_sub_type a, struct event_hdl_sub_type b)
+{
+        if (unlikely(!a.family))
+                a.family = b.family;
+        if (unlikely(a.family != b.family))
+                return a;
+        a.subtype &= ~b.subtype;
+
+        return a;
+}
+
+/* performs addition between A and B event_hdl_sub_type
+ */
+static inline struct event_hdl_sub_type event_hdl_sub_type_add(struct event_hdl_sub_type a, struct event_hdl_sub_type b)
+{
+        if (unlikely(!a.family))
+                a.family = b.family;
+        if (unlikely(a.family != b.family))
+                return a;
+        a.subtype |= b.subtype;
+
+        return a;
+}
+
+/* use this function when you have consumed an event in an async handler
+ * (this will free the event so you must ensure that the event
+ * is already removed from the event queue and that you
+ * no longer make use of it)
+ */
+void event_hdl_async_free_event(struct event_hdl_async_event *e);
+
+/* use this for advanced async mode to initialize event queue */
+static inline void event_hdl_async_equeue_init(event_hdl_async_equeue *queue)
+{
+        MT_LIST_INIT(&queue->head);
+        queue->size = 0;
+}
+
+/* use this for advanced async mode to pop an event from event queue */
+static inline struct event_hdl_async_event *event_hdl_async_equeue_pop(event_hdl_async_equeue *queue)
+{
+        struct event_hdl_async_event *event;
+
+        event = MT_LIST_POP(&queue->head, struct event_hdl_async_event *, mt_list);
+        if (event)
+                HA_ATOMIC_DEC(&queue->size);
+        return event;
+}
+
+/* use this for advanced async mode to check if the event queue is empty */
+static inline int event_hdl_async_equeue_isempty(event_hdl_async_equeue *queue)
+{
+        return MT_LIST_ISEMPTY(&queue->head);
+}
+
+/* use this for advanced async mode to get the event queue size */
+static inline uint32_t event_hdl_async_equeue_size(event_hdl_async_equeue *queue)
+{
+        return HA_ATOMIC_LOAD(&queue->size);
+}
+
+/* use this to initialize <sub_list> event subscription list */
+void event_hdl_sub_list_init(event_hdl_sub_list *sub_list);
+
+/* use this function when you need to destroy <sub_list>
+ * event subscription list
+ * All subscriptions will be removed and properly freed according
+ * to their types
+ */
+void event_hdl_sub_list_destroy(event_hdl_sub_list *sub_list);
+
+/* event_hdl tunables */
+extern struct event_hdl_tune event_hdl_tune;
+
+#endif /* _HAPROXY_EVENT_HDL_H */
diff --git a/include/haproxy/extcheck.h b/include/haproxy/extcheck.h
new file mode 100644
index 0000000..233d7c5
--- /dev/null
+++ b/include/haproxy/extcheck.h
@@ -0,0 +1,49 @@
+/*
+ * include/haproxy/extcheck.h
+ * Function prototypes for the external checks.
+ *
+ * Copyright 2000-2009,2020 Willy Tarreau <w@1wt.eu>
+ * Copyright 2014 Horms Solutions Ltd, Simon Horman <horms@verge.net.au>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_EXTCHECK_H +#define _HAPROXY_EXTCHECK_H + +#include <haproxy/check-t.h> +#include <haproxy/proxy-t.h> +#include <haproxy/task-t.h> + +struct task *process_chk_proc(struct task *t, void *context, unsigned int state); +int prepare_external_check(struct check *check); +int init_pid_list(void); + +int proxy_parse_extcheck(char **args, int section, struct proxy *curpx, + struct proxy *defpx, const char *file, int line, + char **errmsg); + +int proxy_parse_external_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line); + + +#endif /* _HAPROXY_EXTCHECK_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/fcgi-app-t.h b/include/haproxy/fcgi-app-t.h new file mode 100644 index 0000000..fb6ab27 --- /dev/null +++ b/include/haproxy/fcgi-app-t.h @@ -0,0 +1,123 @@ +/* + * include/haproxy/fcgi-app-t.h + * This file defines everything related to FCGI applications. + * + * Copyright (C) 2019 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_HTTP_FCGI_T_H +#define _HAPROXY_HTTP_FCGI_T_H + +#include <import/ebtree-t.h> +#include <import/ist.h> + +#include <haproxy/acl-t.h> +#include <haproxy/api-t.h> +#include <haproxy/arg-t.h> +#include <haproxy/fcgi.h> +#include <haproxy/filters-t.h> +#include <haproxy/regex-t.h> + +#define FCGI_APP_FL_KEEP_CONN 0x00000001 /* Keep the connection alive */ +#define FCGI_APP_FL_GET_VALUES 0x00000002 /* Retrieve FCGI variables on connection establishment */ +#define FCGI_APP_FL_MPXS_CONNS 0x00000004 /* FCGI APP supports connection multiplexing */ + + +enum fcgi_rule_type { + FCGI_RULE_SET_PARAM = 0, + FCGI_RULE_UNSET_PARAM, + FCGI_RULE_PASS_HDR, + FCGI_RULE_HIDE_HDR, +}; + +/* Used during configuration parsing only and converted into fcgi_rule when + * filter is created. 
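+ * As an illustration (editor's addition; set-param is a directive of the
+ * fcgi-app configuration section, my_acl being a hypothetical ACL name),
+ * a line such as:
+ *
+ *   set-param PATH_INFO %[path] if my_acl
+ *
+ * is first stored as a FCGI_RULE_SET_PARAM entry of the struct below.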
+ */
+struct fcgi_rule_conf {
+        enum fcgi_rule_type type;
+        char *name;
+        char *value;
+        struct acl_cond *cond; /* acl condition to set/unset the param */
+        struct list list;
+};
+
+/* parameter rule evaluated during request analysis */
+struct fcgi_rule {
+        enum fcgi_rule_type type;
+        struct ist name;       /* name of the parameter/header */
+        struct list value;     /* log-format compatible expression, may be empty */
+        struct acl_cond *cond; /* acl condition to set the param */
+        struct list list;
+};
+
+/* parameter rule to set/unset a param at the end of the analysis */
+struct fcgi_param_rule {
+        struct ist name;
+        struct list *value; /* if empty, unset the parameter */
+        struct ebpt_node node;
+};
+
+/* header rule to pass/hide a header at the end of the analysis */
+struct fcgi_hdr_rule {
+        struct ist name;
+        int pass; /* 1 to pass the header, 0 otherwise */
+        struct ebpt_node node;
+};
+
+struct fcgi_app {
+        char *name;                   /* name to identify this set of params */
+        struct ist docroot;           /* FCGI docroot */
+        struct ist index;             /* filename to append to URI ending by a '/' */
+        struct my_regex *pathinfo_re; /* Regex to use to split scriptname and path-info */
+        unsigned int flags;           /* FCGI_APP_FL_* */
+        struct list loggers;          /* one per 'log' directive */
+        unsigned int maxreqs;         /* maximum number of concurrent requests */
+
+        struct list acls;             /* list of acls declared for this application */
+
+        struct {
+                char *file;           /* file where the section appears */
+                int line;             /* line where the section appears */
+                struct list rules;    /* list of rules used during config parsing */
+                struct arg_list args; /* sample arg list that need to be resolved */
+        } conf;                       /* config information */
+        struct fcgi_app *next;        /* used to chain fcgi-app */
+};
+
+/* FCGI config attached to backend proxies */
+struct fcgi_flt_conf {
+        char *name;           /* fcgi-app name used during config parsing */
+        struct fcgi_app *app; /* configuration of the fcgi application */
+
+        struct list param_rules; /* list of set/unset rules */
+        struct list hdr_rules;   /* list of pass/hide rules */
+};
+
+/* FCGI context attached to streams */
+struct fcgi_flt_ctx {
+        struct filter *filter;
+        struct fcgi_app *app;
+};
+
+#endif /* _HAPROXY_HTTP_FCGI_T_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/fcgi-app.h b/include/haproxy/fcgi-app.h
new file mode 100644
index 0000000..99f0d58
--- /dev/null
+++ b/include/haproxy/fcgi-app.h
@@ -0,0 +1,42 @@
+/*
+ * include/haproxy/fcgi-app.h
+ * This file defines function prototypes for FCGI applications.
+ *
+ * Copyright (C) 2019 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_HTTP_FCGI_H +#define _HAPROXY_HTTP_FCGI_H + +#include <haproxy/fcgi-app-t.h> +#include <haproxy/proxy-t.h> +#include <haproxy/stream-t.h> + +struct fcgi_app *fcgi_app_find_by_name(const char *name); +struct fcgi_flt_conf *find_px_fcgi_conf(struct proxy *px); +struct fcgi_flt_ctx *find_strm_fcgi_ctx(struct stream *s); +struct fcgi_app *get_px_fcgi_app(struct proxy *px); +struct fcgi_app *get_strm_fcgi_app(struct stream *s); + +#endif /* _HAPROXY_HTTP_FCGI_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/fcgi.h b/include/haproxy/fcgi.h new file mode 100644 index 0000000..e276d69 --- /dev/null +++ b/include/haproxy/fcgi.h @@ -0,0 +1,133 @@ +/* + * include/haproxy/fcgi.h + * This file contains FastCGI protocol definitions. + * + * Copyright (C) 2019 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_FCGI_H +#define _HAPROXY_FCGI_H + +#include <import/ist.h> +#include <haproxy/api.h> +#include <haproxy/buf-t.h> + +/* FCGI protocol version */ +#define FCGI_VERSION 0x1 + +/* flags for FCGI_BEGIN_REQUEST records */ +#define FCGI_KEEP_CONN 0x01 + +/* FCGI record's type */ +enum fcgi_record_type { + FCGI_BEGIN_REQUEST = 1, + FCGI_ABORT_REQUEST = 2, + FCGI_END_REQUEST = 3, + FCGI_PARAMS = 4, + FCGI_STDIN = 5, + FCGI_STDOUT = 6, + FCGI_STDERR = 7, + FCGI_DATA = 8, + FCGI_GET_VALUES = 9, + FCGI_GET_VALUES_RESULT = 10, + FCGI_UNKNOWN_TYPE = 11, + FCGI_ENTRIES +} __attribute__((packed)); + + +enum fcgi_role { + FCGI_RESPONDER = 1, + FCGI_AUTHORIZER = 2, /* Unsupported */ + FCGI_FILTER = 3, /* Unsupported */ +} __attribute__((packed)); + +/* Protocol status */ +enum fcgi_proto_status { + FCGI_PS_REQUEST_COMPLETE = 0, + FCGI_PS_CANT_MPX_CONN = 1, + FCGI_PS_OVERLOADED = 2, + FCGI_PS_UNKNOWN_ROLE = 3, + FCGI_PS_ENTRIES, +} __attribute__((packed)); + +struct fcgi_header { + uint8_t vsn; + uint8_t type; + uint16_t id; + uint16_t len; + uint8_t padding; + uint8_t rsv; +}; + +struct fcgi_param { + struct ist n; + struct ist v; +}; + +struct fcgi_begin_request { + enum fcgi_role role; + uint8_t flags; +}; + +struct fcgi_end_request { + uint32_t status; + uint8_t errcode; +}; + +struct fcgi_unknown_type { + uint8_t type; +}; + + +static inline const char *fcgi_rt_str(int type) +{ + switch (type) { + case FCGI_BEGIN_REQUEST : return "BEGIN_REQUEST"; + case FCGI_ABORT_REQUEST : return "ABORT_REQUEST"; + case FCGI_END_REQUEST : return "END_REQUEST"; + case FCGI_PARAMS : return "PARAMS"; + case FCGI_STDIN : return "STDIN"; + case FCGI_STDOUT : return "STDOUT"; + case 
FCGI_STDERR         : return "STDERR";
+        case FCGI_DATA              : return "DATA";
+        case FCGI_GET_VALUES        : return "GET_VALUES";
+        case FCGI_GET_VALUES_RESULT : return "GET_VALUES_RESULT";
+        case FCGI_UNKNOWN_TYPE      : return "UNKNOWN_TYPE";
+        default                     : return "_UNKNOWN_";
+        }
+}
+
+
+int fcgi_encode_record_hdr(struct buffer *out, const struct fcgi_header *h);
+size_t fcgi_decode_record_hdr(const struct buffer *in, size_t o, struct fcgi_header *h);
+
+int fcgi_encode_begin_request(struct buffer *out, const struct fcgi_begin_request *r);
+
+int fcgi_encode_param(struct buffer *out, const struct fcgi_param *p);
+size_t fcgi_decode_param(const struct buffer *in, size_t o, struct fcgi_param *p);
+size_t fcgi_aligned_decode_param(const struct buffer *in, size_t o, struct fcgi_param *p);
+
+size_t fcgi_decode_end_request(const struct buffer *in, size_t o, struct fcgi_end_request *r);
+
+#endif /* _HAPROXY_FCGI_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/fd-t.h b/include/haproxy/fd-t.h
new file mode 100644
index 0000000..c5e94cb
--- /dev/null
+++ b/include/haproxy/fd-t.h
@@ -0,0 +1,251 @@
+/*
+ * include/haproxy/fd-t.h
+ * File descriptor states - check src/fd.c for explanations.
+ *
+ * Copyright (C) 2000-2014 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_FD_T_H
+#define _HAPROXY_FD_T_H
+
+#include <haproxy/api-t.h>
+#include <haproxy/port_range-t.h>
+#include <haproxy/show_flags-t.h>
+
+/* Direction for each FD event update */
+enum {
+        DIR_RD=0,
+        DIR_WR=1,
+};
+
+
+/* fdtab[].state is a composite state describing what is known about the FD.
+ * For now, the following information is stored in it:
+ *   - event configuration and status for each direction (R,W) split into
+ *     active, ready, shutdown categories (FD_EV_*). These are known by their
+ *     bit values as well so that test-and-set bit operations are possible.
+ *
+ *   - last known polling status (FD_POLL_*). For ease of troubleshooting,
+ *     avoid visually mixing these ones with the other ones above. 3 of these
+ *     flags are updated on each poll() report (FD_POLL_IN, FD_POLL_OUT,
+ *     FD_POLL_PRI). FD_POLL_HUP and FD_POLL_ERR are "sticky" in that once they
+ *     are reported, they will not be cleared until the FD is closed.
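+ *
+ * Illustrative sketch (editor's addition): both categories can be tested
+ * directly on the composite state, e.g.:
+ *
+ *   if (fdtab[fd].state & (FD_POLL_ERR | FD_POLL_HUP))
+ *       ... the reported error/hangup will stick until the FD is closed ...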
+ */ + +/* bits positions for a few flags */ +#define FD_EV_ACTIVE_R_BIT 0 +#define FD_EV_READY_R_BIT 1 +#define FD_EV_SHUT_R_BIT 2 +/* unused: 3 */ + +#define FD_EV_ACTIVE_W_BIT 4 +#define FD_EV_READY_W_BIT 5 +#define FD_EV_SHUT_W_BIT 6 +#define FD_EV_ERR_RW_BIT 7 + +#define FD_POLL_IN_BIT 8 +#define FD_POLL_PRI_BIT 9 +#define FD_POLL_OUT_BIT 10 +#define FD_POLL_ERR_BIT 11 +#define FD_POLL_HUP_BIT 12 + +/* info/config bits */ +#define FD_LINGER_RISK_BIT 16 /* must kill lingering before closing */ +#define FD_CLONED_BIT 17 /* cloned socket, requires EPOLL_CTL_DEL on close */ +#define FD_INITIALIZED_BIT 18 /* init phase was done (e.g. output pipe set non-blocking) */ +#define FD_ET_POSSIBLE_BIT 19 /* edge-triggered is possible on this FD */ +#define FD_EXPORTED_BIT 20 /* FD is exported and must not be closed */ +#define FD_EXCL_SYSCALL_BIT 21 /* a syscall claims exclusivity on this FD */ +#define FD_DISOWN_BIT 22 /* this fd will be closed by some external code */ +#define FD_MUST_CLOSE_BIT 23 /* this fd will be closed by some external code */ + + +/* and flag values */ +#define FD_EV_ACTIVE_R (1U << FD_EV_ACTIVE_R_BIT) +#define FD_EV_ACTIVE_W (1U << FD_EV_ACTIVE_W_BIT) +#define FD_EV_ACTIVE_RW (FD_EV_ACTIVE_R | FD_EV_ACTIVE_W) + +#define FD_EV_READY_R (1U << FD_EV_READY_R_BIT) +#define FD_EV_READY_W (1U << FD_EV_READY_W_BIT) +#define FD_EV_READY_RW (FD_EV_READY_R | FD_EV_READY_W) + +/* note that when FD_EV_SHUT is set, ACTIVE and READY are cleared */ +#define FD_EV_SHUT_R (1U << FD_EV_SHUT_R_BIT) +#define FD_EV_SHUT_W (1U << FD_EV_SHUT_W_BIT) +#define FD_EV_SHUT_RW (FD_EV_SHUT_R | FD_EV_SHUT_W) + +/* note that when FD_EV_ERR is set, SHUT is also set. Also, ERR is for both + * directions at once (write error, socket dead, etc). + */ +#define FD_EV_ERR_RW (1U << FD_EV_ERR_RW_BIT) + +/* mask covering all use cases above */ +#define FD_EV_ANY (FD_EV_ACTIVE_RW | FD_EV_READY_RW | FD_EV_SHUT_RW | FD_EV_ERR_RW) + +/* polling status */ +#define FD_POLL_IN (1U << FD_POLL_IN_BIT) +#define FD_POLL_PRI (1U << FD_POLL_PRI_BIT) +#define FD_POLL_OUT (1U << FD_POLL_OUT_BIT) +#define FD_POLL_ERR (1U << FD_POLL_ERR_BIT) +#define FD_POLL_HUP (1U << FD_POLL_HUP_BIT) +#define FD_POLL_UPDT_MASK (FD_POLL_IN | FD_POLL_PRI | FD_POLL_OUT) +#define FD_POLL_ANY_MASK (FD_POLL_IN | FD_POLL_PRI | FD_POLL_OUT | FD_POLL_ERR | FD_POLL_HUP) + +/* information/configuration flags */ +#define FD_LINGER_RISK (1U << FD_LINGER_RISK_BIT) +#define FD_CLONED (1U << FD_CLONED_BIT) +#define FD_INITIALIZED (1U << FD_INITIALIZED_BIT) +#define FD_ET_POSSIBLE (1U << FD_ET_POSSIBLE_BIT) +#define FD_EXPORTED (1U << FD_EXPORTED_BIT) +#define FD_EXCL_SYSCALL (1U << FD_EXCL_SYSCALL_BIT) +#define FD_DISOWN (1U << FD_DISOWN_BIT) +#define FD_MUST_CLOSE (1U << FD_MUST_CLOSE_BIT) + +/* This function is used to report flags in debugging tools. Please reflect + * below any single-bit flag addition above in the same order via the + * __APPEND_FLAG macro. The new end of the buffer is returned. + */ +static forceinline char *fd_show_flags(char *buf, size_t len, const char *delim, uint flg) +{ +#define _(f, ...) 
__APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__) + /* prologue */ + _(0); + /* flags */ + _(FD_EV_ACTIVE_R, _(FD_EV_ACTIVE_W, _(FD_EV_READY_R, _(FD_EV_READY_W, + _(FD_EV_SHUT_R, _(FD_EV_SHUT_W, _(FD_EV_ERR_RW, _(FD_POLL_IN, + _(FD_POLL_PRI, _(FD_POLL_OUT, _(FD_POLL_ERR, _(FD_POLL_HUP, + _(FD_LINGER_RISK, _(FD_CLONED, _(FD_INITIALIZED, _(FD_ET_POSSIBLE, + _(FD_EXPORTED, _(FD_EXCL_SYSCALL, _(FD_DISOWN))))))))))))))))))); + /* epilogue */ + _(~0U); + return buf; +#undef _ +} + +/* FD update status after fd_update_events() */ +enum { + FD_UPDT_DONE = 0, // update done, nothing else to be done + FD_UPDT_CLOSED, // FD was closed + FD_UPDT_MIGRATED, // FD was migrated, ignore it now +}; + +/* This is the value used to mark a file descriptor as dead. This value is + * negative, this is important so that tests on fd < 0 properly match. It + * also has the nice property of being highly negative but neither overflowing + * nor changing sign on 32-bit machines when multiplied by sizeof(fdtab). + * This ensures that any unexpected dereference of such an uninitialized + * file descriptor will lead to so large a dereference that it will crash + * the process at the exact location of the bug with a clean stack trace + * instead of causing silent manipulation of other FDs. And it's readable + * when found in a dump. + */ +#define DEAD_FD_MAGIC 0xFDDEADFD + +/* fdlist_entry: entry used by the fd cache. + * >= 0 means we're in the cache and gives the FD of the next in the cache, + * -1 means we're in the cache and the last element, + * -2 means the entry is locked, + * <= -3 means not in the cache, and next element is -4-fd + * + * It must remain 8-aligned so that aligned CAS operations may be done on both + * entries at once. + */ +struct fdlist_entry { + int next; + int prev; +} ALIGNED(8); + +/* head of the fd cache, per-group */ +struct fdlist { + int first; + int last; +} ALIGNED(64); + +/* info about one given fd. Note: only align on cache lines when using threads; + * 32-bit small archs can put everything in 32-bytes when threads are disabled. + * refc_tgid is an atomic 32-bit composite value made of 16 higher bits + * containing a refcount on tgid and the running_mask, and 16 lower bits + * containing a thread group ID and a lock bit on the 16th. The tgid may only + * be changed when refc is zero and running may only be checked/changed when + * refc is held and shows the reader is alone. An FD with tgid zero belongs to + * nobody. + */ +struct fdtab { + unsigned long running_mask; /* mask of thread IDs currently using the fd */ + unsigned long thread_mask; /* mask of thread IDs authorized to process the fd */ + unsigned long update_mask; /* mask of thread IDs having an update for fd */ + struct fdlist_entry update; /* Entry in the global update list */ + void (*iocb)(int fd); /* I/O handler */ + void *owner; /* the connection or listener associated with this fd, NULL if closed */ + unsigned int state; /* FD state for read and write directions (FD_EV_*) + FD_POLL_* */ + unsigned int refc_tgid; /* refcounted tgid, updated atomically */ +#ifdef DEBUG_FD + unsigned int event_count; /* number of events reported */ +#endif +} THREAD_ALIGNED(64); + +/* polled mask, one bit per thread and per direction for each FD */ +struct polled_mask { + unsigned long poll_recv; + unsigned long poll_send; +}; + +/* less often used information */ +struct fdinfo { + struct port_range *port_range; /* optional port range to bind to */ + int local_port; /* optional local port */ +}; + +/* + * Poller descriptors. 
+ * - <name> is initialized by the poller's register() function, and should not
+ *   be allocated, just linked to.
+ * - <pref> is initialized by the poller's register() function. It is set to 0
+ *   by default, meaning the poller is disabled. init() should set it to 0 in
+ *   case of failure. term() must set it to 0. A generic unoptimized select()
+ *   poller should set it to 100.
+ * - <private> is initialized by the poller's init() function, and cleaned by
+ *   the term() function.
+ * - clo() should be used to indicate to the poller that fd will be closed.
+ * - poll() calls the poller, expiring at <exp>, or immediately if <wake> is set
+ * - flags indicate what the poller supports (HAP_POLL_F_*)
+ */
+
+#define HAP_POLL_F_RDHUP  0x00000001 /* the poller notifies of HUP with reads */
+#define HAP_POLL_F_ERRHUP 0x00000002 /* the poller reports ERR and HUP */
+
+struct poller {
+        void *private;                                     /* any private data for the poller */
+        void (*clo)(const int fd);                         /* mark <fd> as closed */
+        void (*poll)(struct poller *p, int exp, int wake); /* the poller itself */
+        int (*init)(struct poller *p);                     /* poller initialization */
+        void (*term)(struct poller *p);                    /* termination of this poller */
+        int (*test)(struct poller *p);                     /* pre-init check of the poller */
+        int (*fork)(struct poller *p);                     /* post-fork re-opening */
+        const char *name;                                  /* poller name */
+        unsigned int flags;                                /* HAP_POLL_F_* */
+        int pref;                                          /* try pollers with higher preference first */
+};
+
+#endif /* _HAPROXY_FD_T_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/fd.h b/include/haproxy/fd.h
new file mode 100644
index 0000000..11212ff
--- /dev/null
+++ b/include/haproxy/fd.h
@@ -0,0 +1,542 @@
+/*
+ * include/haproxy/fd.h
+ * File descriptor states - exported variables and functions
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_FD_H
+#define _HAPROXY_FD_H
+
+#include <sys/time.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <import/ist.h>
+#include <haproxy/api.h>
+#include <haproxy/atomic.h>
+#include <haproxy/fd-t.h>
+#include <haproxy/global.h>
+#include <haproxy/thread.h>
+
+/* public variables */
+
+extern struct poller cur_poller; /* the current poller */
+extern int nbpollers;
+extern struct poller pollers[MAX_POLLERS]; /* all registered pollers */
+extern struct fdtab *fdtab;   /* array of all the file descriptors */
+extern struct fdinfo *fdinfo; /* less-often used infos for file descriptors */
+extern int totalconn;         /* total # of terminated sessions */
+extern int actconn;           /* # of active sessions */
+
+extern volatile struct fdlist update_list[MAX_TGROUPS];
+extern struct polled_mask *polled_mask;
+
+extern THREAD_LOCAL int *fd_updt;  // FD updates list
+extern THREAD_LOCAL int fd_nbupdt; // number of updates in the list
+
+extern int poller_wr_pipe[MAX_THREADS];
+
+extern volatile int ha_used_fds; // Number of FDs we're currently using
+
+/* Deletes an FD from the fdsets.
+ * The file descriptor is also closed.
+ */
+void fd_delete(int fd);
+void _fd_delete_orphan(int fd);
+
+/* makes the new fd non-blocking and clears all other O_* flags;
+ * this is meant to be used on new FDs. Returns -1 on failure.
+ */
+int fd_set_nonblock(int fd);
+
+/* makes the fd close-on-exec; returns -1 on failure. */
+int fd_set_cloexec(int fd);
+
+/* Migrate a FD to a new thread <new_tid>. */
+void fd_migrate_on(int fd, uint new_tid);
+
+/*
+ * Take over a FD belonging to another thread.
+ * Returns 0 on success, and -1 on failure.
+ */
+int fd_takeover(int fd, void *expected_owner);
+
+ssize_t fd_write_frag_line(int fd, size_t maxlen, const struct ist pfx[], size_t npfx, const struct ist msg[], size_t nmsg, int nl);
+
+/* close all FDs starting from <start> */
+void my_closefrom(int start);
+
+struct rlimit;
+int raise_rlim_nofile(struct rlimit *old_limit, struct rlimit *new_limit);
+
+int compute_poll_timeout(int next);
+void fd_leaving_poll(int wait_time, int status);
+
+/* disable the specified poller */
+void disable_poller(const char *poller_name);
+
+void poller_pipe_io_handler(int fd);
+
+/*
+ * Initialize the pollers till the best one is found.
+ * If none works, returns 0, otherwise 1.
+ * The pollers register themselves just before main() is called.
+ */
+int init_pollers(void);
+
+/*
+ * Deinitialize the pollers.
+ */
+void deinit_pollers(void);
+
+/*
+ * Some pollers may lose their connection after a fork(). It may be necessary
+ * to re-initialize part of them again. Returns 0 in case of failure,
+ * otherwise 1. The fork() function may be NULL if unused. In case of error,
+ * the current poller is destroyed and the caller is responsible for trying
+ * another one by calling init_pollers() again.
+ */
+int fork_poller(void);
+
+/*
+ * Lists the known pollers on <out>.
+ * Should be performed only before initialization.
+ */ +int list_pollers(FILE *out); + +/* + * Runs the polling loop + */ +void run_poller(); + +void fd_add_to_fd_list(volatile struct fdlist *list, int fd); +void fd_rm_from_fd_list(volatile struct fdlist *list, int fd); +void updt_fd_polling(const int fd); +int fd_update_events(int fd, uint evts); +void fd_reregister_all(int tgrp, ulong mask); + +/* Called from the poller to acknowledge we read an entry from the global + * update list, to remove our bit from the update_mask, and remove it from + * the list if we were the last one. + */ +static inline void done_update_polling(int fd) +{ + unsigned long update_mask; + + update_mask = _HA_ATOMIC_AND_FETCH(&fdtab[fd].update_mask, ~ti->ltid_bit); + while ((update_mask & _HA_ATOMIC_LOAD(&tg->threads_enabled)) == 0) { + /* If we were the last one that had to update that entry, remove it from the list */ + fd_rm_from_fd_list(&update_list[tgid - 1], fd); + update_mask = _HA_ATOMIC_LOAD(&fdtab[fd].update_mask); + if ((update_mask & _HA_ATOMIC_LOAD(&tg->threads_enabled)) != 0) { + /* Maybe it's been re-updated in the meanwhile, and we + * wrongly removed it from the list, if so, re-add it + */ + fd_add_to_fd_list(&update_list[tgid - 1], fd); + update_mask = _HA_ATOMIC_LOAD(&fdtab[fd].update_mask); + /* And then check again, just in case after all it + * should be removed, even if it's very unlikely, given + * the current thread wouldn't have been able to take + * care of it yet */ + } else + break; + } +} + +/* + * returns true if the FD is active for recv + */ +static inline int fd_recv_active(const int fd) +{ + return (unsigned)fdtab[fd].state & FD_EV_ACTIVE_R; +} + +/* + * returns true if the FD is ready for recv + */ +static inline int fd_recv_ready(const int fd) +{ + return (unsigned)fdtab[fd].state & FD_EV_READY_R; +} + +/* + * returns true if the FD is active for send + */ +static inline int fd_send_active(const int fd) +{ + return (unsigned)fdtab[fd].state & FD_EV_ACTIVE_W; +} + +/* + * returns true if the FD is ready for send + */ +static inline int fd_send_ready(const int fd) +{ + return (unsigned)fdtab[fd].state & FD_EV_READY_W; +} + +/* + * returns true if the FD is active for recv or send + */ +static inline int fd_active(const int fd) +{ + return (unsigned)fdtab[fd].state & FD_EV_ACTIVE_RW; +} + +/* Disable processing recv events on fd <fd> */ +static inline void fd_stop_recv(int fd) +{ + if (!(fdtab[fd].state & FD_EV_ACTIVE_R) || + !HA_ATOMIC_BTR(&fdtab[fd].state, FD_EV_ACTIVE_R_BIT)) + return; +} + +/* Disable processing send events on fd <fd> */ +static inline void fd_stop_send(int fd) +{ + if (!(fdtab[fd].state & FD_EV_ACTIVE_W) || + !HA_ATOMIC_BTR(&fdtab[fd].state, FD_EV_ACTIVE_W_BIT)) + return; +} + +/* Disable processing of events on fd <fd> for both directions. */ +static inline void fd_stop_both(int fd) +{ + uint old, new; + + old = fdtab[fd].state; + do { + if (!(old & FD_EV_ACTIVE_RW)) + return; + new = old & ~FD_EV_ACTIVE_RW; + } while (unlikely(!_HA_ATOMIC_CAS(&fdtab[fd].state, &old, new))); +} + +/* Report that FD <fd> cannot receive anymore without polling (EAGAIN detected). */ +static inline void fd_cant_recv(const int fd) +{ + /* marking ready never changes polled status */ + if (!(fdtab[fd].state & FD_EV_READY_R) || + !HA_ATOMIC_BTR(&fdtab[fd].state, FD_EV_READY_R_BIT)) + return; +} + +/* Report that FD <fd> may receive again without polling. 
 */
+static inline void fd_may_recv(const int fd)
+{
+        /* marking ready never changes polled status */
+        if ((fdtab[fd].state & FD_EV_READY_R) ||
+            HA_ATOMIC_BTS(&fdtab[fd].state, FD_EV_READY_R_BIT))
+                return;
+}
+
+/* Report that FD <fd> may receive again without polling but only if it's not
+ * active yet. This is in order to speculatively try to enable I/Os when it's
+ * highly likely that these will succeed, but without interfering with polling.
+ */
+static inline void fd_cond_recv(const int fd)
+{
+        if ((fdtab[fd].state & (FD_EV_ACTIVE_R|FD_EV_READY_R)) == 0)
+                HA_ATOMIC_BTS(&fdtab[fd].state, FD_EV_READY_R_BIT);
+}
+
+/* Report that FD <fd> may send again without polling but only if it's not
+ * active yet. This is in order to speculatively try to enable I/Os when it's
+ * highly likely that these will succeed, but without interfering with polling.
+ */
+static inline void fd_cond_send(const int fd)
+{
+        if ((fdtab[fd].state & (FD_EV_ACTIVE_W|FD_EV_READY_W)) == 0)
+                HA_ATOMIC_BTS(&fdtab[fd].state, FD_EV_READY_W_BIT);
+}
+
+/* Report that FD <fd> may receive and send without polling. Used at FD
+ * initialization.
+ */
+static inline void fd_may_both(const int fd)
+{
+        HA_ATOMIC_OR(&fdtab[fd].state, FD_EV_READY_RW);
+}
+
+/* Report that FD <fd> cannot send anymore without polling (EAGAIN detected). */
+static inline void fd_cant_send(const int fd)
+{
+        /* removing ready never changes polled status */
+        if (!(fdtab[fd].state & FD_EV_READY_W) ||
+            !HA_ATOMIC_BTR(&fdtab[fd].state, FD_EV_READY_W_BIT))
+                return;
+}
+
+/* Report that FD <fd> may send again without polling (EAGAIN not detected). */
+static inline void fd_may_send(const int fd)
+{
+        /* marking ready never changes polled status */
+        if ((fdtab[fd].state & FD_EV_READY_W) ||
+            HA_ATOMIC_BTS(&fdtab[fd].state, FD_EV_READY_W_BIT))
+                return;
+}
+
+/* Prepare FD <fd> to try to receive */
+static inline void fd_want_recv(int fd)
+{
+        if ((fdtab[fd].state & FD_EV_ACTIVE_R) ||
+            HA_ATOMIC_BTS(&fdtab[fd].state, FD_EV_ACTIVE_R_BIT))
+                return;
+        updt_fd_polling(fd);
+}
+
+/* Prepare FD <fd> to try to receive, and only create update if fd_updt exists
+ * (essentially for receivers during early boot).
+ */
+static inline void fd_want_recv_safe(int fd)
+{
+        if ((fdtab[fd].state & FD_EV_ACTIVE_R) ||
+            HA_ATOMIC_BTS(&fdtab[fd].state, FD_EV_ACTIVE_R_BIT))
+                return;
+        if (fd_updt)
+                updt_fd_polling(fd);
+}
+
+/* Prepare FD <fd> to try to send */
+static inline void fd_want_send(int fd)
+{
+        if ((fdtab[fd].state & FD_EV_ACTIVE_W) ||
+            HA_ATOMIC_BTS(&fdtab[fd].state, FD_EV_ACTIVE_W_BIT))
+                return;
+        updt_fd_polling(fd);
+}
+
+/* returns the tgid from an fd (masks the refcount) */
+static forceinline int fd_tgid(int fd)
+{
+        return _HA_ATOMIC_LOAD(&fdtab[fd].refc_tgid) & 0xFFFF;
+}
+
+/* Release a tgid previously taken by fd_grab_tgid() */
+static forceinline void fd_drop_tgid(int fd)
+{
+        HA_ATOMIC_SUB(&fdtab[fd].refc_tgid, 0x10000);
+}
+
+/* Unlock a tgid currently locked by fd_lock_tgid(). This will effectively
+ * allow threads from the FD's tgid to check the masks and manipulate the FD.
+ */
+static forceinline void fd_unlock_tgid(int fd)
+{
+        HA_ATOMIC_AND(&fdtab[fd].refc_tgid, 0xffff7fffU);
+}
+
+/* Switch the FD's TGID to the new value with a refcount of 1 and the lock bit
+ * set. It doesn't care about the current TGID, except that it will wait until
+ * the FD is no longer switching and its refcount has dropped to zero.
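+ * (Illustrative summary, deduced from the constants used below: bits 0..14 of
+ * refc_tgid hold the tgid, bit 15 (0x8000) is the lock bit, and bits 16..31
+ * hold the refcount, one unit being 0x10000.)
+ *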
After
+ * the function returns, the caller is free to manipulate the masks, and it
+ * must call fd_unlock_tgid() to drop the lock, allowing threads from the
+ * designated group to use the FD. Finally a call to fd_drop_tgid() will be
+ * needed to drop the reference.
+ */
+static inline void fd_lock_tgid(int fd, uint desired_tgid)
+{
+        uint old;
+
+        BUG_ON(!desired_tgid);
+
+        old = tgid; // assume we start from the caller's tgid
+        desired_tgid |= 0x18000; // refcount=1, lock bit=1.
+
+        while (1) {
+                old &= 0x7fff; // expect no lock and refcount==0
+                if (_HA_ATOMIC_CAS(&fdtab[fd].refc_tgid, &old, desired_tgid))
+                        break;
+                __ha_cpu_relax();
+        }
+}
+
+/* Grab a reference to the FD's TGID, and return the tgid. Note that a TGID of
+ * zero indicates the FD was closed, thus also fails (i.e. no need to drop it).
+ * On non-zero (success), the caller must release it using fd_drop_tgid().
+ */
+static inline uint fd_take_tgid(int fd)
+{
+        uint old;
+
+        old = _HA_ATOMIC_FETCH_ADD(&fdtab[fd].refc_tgid, 0x10000) & 0xffff;
+        if (likely(old))
+                return old;
+        HA_ATOMIC_SUB(&fdtab[fd].refc_tgid, 0x10000);
+        return 0;
+}
+
+/* Reset a tgid without affecting the refcount */
+static forceinline void fd_reset_tgid(int fd)
+{
+        HA_ATOMIC_AND(&fdtab[fd].refc_tgid, 0xffff0000U);
+}
+
+/* Try to grab a reference to the FD's TGID, but only if it matches the
+ * requested one (i.e. it succeeds with TGID refcnt held, or fails). Note that
+ * a TGID of zero indicates the FD was closed, thus also fails. It returns
+ * non-zero on success, in which case the caller must then release it using
+ * fd_drop_tgid(), or zero on failure. The function is optimized for use
+ * when it's likely that the tgid matches the desired one as it's by far
+ * the most common.
+ */
+static inline uint fd_grab_tgid(int fd, uint desired_tgid)
+{
+        uint old;
+
+        old = _HA_ATOMIC_FETCH_ADD(&fdtab[fd].refc_tgid, 0x10000) & 0xffff;
+        if (likely(old == desired_tgid))
+                return 1;
+        HA_ATOMIC_SUB(&fdtab[fd].refc_tgid, 0x10000);
+        return 0;
+}
+
+/* Set the FD's TGID to the new value with a refcount of 1, waiting for the
+ * current refcount to become 0, to cover the rare possibility that a late
+ * competing thread would be touching the tgid or the running mask in parallel.
+ * The caller must call fd_drop_tgid() once done.
+ */
+static inline void fd_claim_tgid(int fd, uint desired_tgid)
+{
+        uint old;
+
+        BUG_ON(!desired_tgid);
+
+        desired_tgid += 0x10000; // refcount=1
+        old = 0; // assume unused (most likely)
+        while (1) {
+                if (_HA_ATOMIC_CAS(&fdtab[fd].refc_tgid, &old, desired_tgid))
+                        break;
+                __ha_cpu_relax();
+                old &= 0x7fff; // keep only the tgid and drop the lock
+        }
+}
+
+/* atomically read the running mask if the tgid matches, or returns zero if it
+ * does not match. This is meant for use in code paths where the bit is expected
+ * to be present and will be sufficient to protect against a short-term group
+ * migration (e.g. takeover and return from iocb).
+ */
+static inline ulong fd_get_running(int fd, uint desired_tgid)
+{
+        ulong ret = 0;
+        uint old;
+
+        /* TODO: may also be checked using an atomic double-load from a DWCAS
+         * on compatible architectures, which wouldn't require modifying nor
+         * restoring the original value.
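+         * (For reference, the pattern actually used below, illustratively:
+         * bump the refcount so the tgid cannot change under us, read
+         * running_mask only if the tgid still matches, then drop the
+         * reference again.)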
+         */
+        old = _HA_ATOMIC_ADD_FETCH(&fdtab[fd].refc_tgid, 0x10000);
+        if (likely((old & 0xffff) == desired_tgid))
+                ret = _HA_ATOMIC_LOAD(&fdtab[fd].running_mask);
+        _HA_ATOMIC_SUB(&fdtab[fd].refc_tgid, 0x10000);
+        return ret;
+}
+
+/* remove the thread's ltid_bit from the fd's running mask and returns the
+ * value before the atomic operation, so that the caller can know if it was
+ * present.
+ */
+static inline long fd_clr_running(int fd)
+{
+        return _HA_ATOMIC_FETCH_AND(&fdtab[fd].running_mask, ~ti->ltid_bit);
+}
+
+/* Prepares <fd> for being polled on all permitted threads of this group ID
+ * (these will then be refined to only cover running ones).
+*/
+static inline void fd_insert(int fd, void *owner, void (*iocb)(int fd), int tgid, unsigned long thread_mask)
+{
+        extern void sock_conn_iocb(int);
+        int newstate;
+
+        /* conn_fd_handler should support edge-triggered FDs */
+        newstate = 0;
+        if ((global.tune.options & GTUNE_FD_ET) && iocb == sock_conn_iocb)
+                newstate |= FD_ET_POSSIBLE;
+
+        /* This must never happen and would definitely indicate a bug, in
+         * addition to overwriting some unexpected memory areas.
+         */
+        BUG_ON(fd < 0);
+        BUG_ON(fd >= global.maxsock);
+        BUG_ON(fdtab[fd].owner != NULL);
+        BUG_ON(fdtab[fd].state != 0);
+        BUG_ON(tgid < 1 || tgid > MAX_TGROUPS);
+
+        thread_mask &= tg->threads_enabled;
+        BUG_ON(thread_mask == 0);
+
+        fd_claim_tgid(fd, tgid);
+
+        BUG_ON(fdtab[fd].running_mask);
+
+        fdtab[fd].owner = owner;
+        fdtab[fd].iocb = iocb;
+        fdtab[fd].state = newstate;
+        fdtab[fd].thread_mask = thread_mask;
+        fd_drop_tgid(fd);
+
+#ifdef DEBUG_FD
+        fdtab[fd].event_count = 0;
+#endif
+
+        /* note: do not reset polled_mask here as it indicates which poller
+         * still knows this FD from a possible previous round.
+         */
+
+        /* the two directions are ready until proven otherwise */
+        fd_may_both(fd);
+        _HA_ATOMIC_INC(&ha_used_fds);
+}
+
+/* These are replacements for FD_SET, FD_CLR, FD_ISSET, working on uints */
+static inline void hap_fd_set(int fd, unsigned int *evts)
+{
+        _HA_ATOMIC_OR(&evts[fd / (8*sizeof(*evts))], 1U << (fd & (8*sizeof(*evts) - 1)));
+}
+
+static inline void hap_fd_clr(int fd, unsigned int *evts)
+{
+        _HA_ATOMIC_AND(&evts[fd / (8*sizeof(*evts))], ~(1U << (fd & (8*sizeof(*evts) - 1))));
+}
+
+static inline unsigned int hap_fd_isset(int fd, unsigned int *evts)
+{
+        return evts[fd / (8*sizeof(*evts))] & (1U << (fd & (8*sizeof(*evts) - 1)));
+}
+
+/* send a wake-up event to this thread, only if it's asleep and not notified yet */
+static inline void wake_thread(int thr)
+{
+        struct thread_ctx *ctx = &ha_thread_ctx[thr];
+
+        if ((_HA_ATOMIC_FETCH_OR(&ctx->flags, TH_FL_NOTIFIED) & (TH_FL_SLEEPING|TH_FL_NOTIFIED)) == TH_FL_SLEEPING) {
+                char c = 'c';
+                DISGUISE(write(poller_wr_pipe[thr], &c, 1));
+        }
+}
+
+
+#endif /* _HAPROXY_FD_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/filters-t.h b/include/haproxy/filters-t.h new file mode 100644 index 0000000..c86ef6f --- /dev/null +++ b/include/haproxy/filters-t.h @@ -0,0 +1,258 @@
+/*
+ * include/haproxy/filters-t.h
+ * This file defines everything related to stream filters.
+ *
+ * Copyright (C) 2015 Qualys Inc., Christopher Faulet <cfaulet@qualys.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef _HAPROXY_FILTERS_T_H
+#define _HAPROXY_FILTERS_T_H
+
+#include <haproxy/api-t.h>
+
+/* Flags set on a filter config */
+#define FLT_CFG_FL_HTX 0x00000001 /* The filter can filter HTX streams */
+
+/* Flags set on a filter instance */
+#define FLT_FL_IS_BACKEND_FILTER 0x0001 /* The filter is a backend filter */
+#define FLT_FL_IS_REQ_DATA_FILTER 0x0002 /* The filter will parse data on the request channel */
+#define FLT_FL_IS_RSP_DATA_FILTER 0x0004 /* The filter will parse data on the response channel */
+
+/* Flags set on the stream, common to all filters attached to its stream */
+#define STRM_FLT_FL_HAS_FILTERS 0x0001 /* The stream has at least one filter */
+#define STRM_FLT_FL_HOLD_HTTP_HDRS 0x0002 /* At least one filter on the stream wants to hold the message headers */
+
+
+struct http_msg;
+struct proxy;
+struct stream;
+struct channel;
+struct flt_conf;
+struct filter;
+
+/* Descriptor for a "filter" keyword. The ->parse() function returns 0 in case
+ * of success, or a combination of ERR_* flags if an error is encountered. The
+ * function pointer can be NULL if not implemented.
+ */
+struct flt_kw {
+        const char *kw;
+        int (*parse)(char **args, int *cur_arg, struct proxy *px,
+                     struct flt_conf *fconf, char **err, void *private);
+        void *private;
+};
+
+/*
+ * A keyword list. It is a NULL-terminated array of keywords. It embeds a struct
+ * list in order to be linked to other lists, allowing it to easily be declared
+ * where it is needed, and linked without duplicating data nor allocating
+ * memory. It is also possible to indicate a scope for the keywords.
+ */
+struct flt_kw_list {
+        const char *scope;
+        struct list list;
+        struct flt_kw kw[VAR_ARRAY];
+};
+
+/*
+ * Callbacks available on a filter:
+ *
+ *  - init                : Initializes the filter for a proxy. Returns a
+ *                          negative value if an error occurs.
+ *  - deinit              : Cleans up what the init function has done.
+ *  - check               : Check the filter config for a proxy. Returns the
+ *                          number of errors encountered.
+ *  - init_per_thread     : Initializes the filter for a proxy for a specific
+ *                          thread. Returns a negative value if an error
+ *                          occurs.
+ *  - deinit_per_thread   : Cleans up what the init_per_thread function has
+ *                          done.
+ *
+ *
+ *  - attach              : Called after a filter instance creation, when it is
+ *                          attached to a stream. This happens when the stream
+ *                          is started for filters defined on the stream's
+ *                          frontend and when the backend is set for filters
+ *                          declared on the stream's backend.
+ *                          Returns a negative value if an error occurs, 0 if
+ *                          the filter must be ignored for the stream, any other
+ *                          value otherwise.
+ *  - stream_start        : Called when a stream is started. This callback will
+ *                          only be called for filters defined on the stream's
+ *                          frontend.
+ *                          Returns a negative value if an error occurs, any
+ *                          other value otherwise.
+ *  - stream_set_backend  : Called when a backend is set for a stream. This
+ *                          callback will be called for all filters attached
+ *                          to a stream (frontend and backend).
+ *                          Returns a negative value if an error occurs, any
+ *                          other value otherwise.
+ *  - stream_stop         : Called when a stream is stopped. This callback will
+ *                          only be called for filters defined on the stream's
+ *                          frontend.
+ *  - detach              : Called when a filter instance is detached from a
+ *                          stream, before its destruction. This happens when
+ *                          the stream is stopped for filters defined on the
+ *                          stream's frontend and when the analyze ends for
+ *                          filters defined on the stream's backend.
+ *  - check_timeouts      : Called when a stream is woken up because of an
+ *                          expired timer.
+ *
+ *
+ *  - channel_start_analyze: Called when a filter starts to analyze a channel.
+ *                          Returns a negative value if an error occurs, 0 if
+ *                          it needs to wait, any other value otherwise.
+ *  - channel_pre_analyze : Called before each analyzer attached to a channel,
+ *                          except analyzers responsible for data sending.
+ *                          Returns a negative value if an error occurs, 0 if
+ *                          it needs to wait, any other value otherwise.
+ *  - channel_post_analyze: Called after each analyzer attached to a channel,
+ *                          except analyzers responsible for data sending.
+ *                          Returns a negative value if an error occurs,
+ *                          any other value otherwise.
+ *  - channel_end_analyze : Called when all other analyzers have finished their
+ *                          processing.
+ *                          Returns a negative value if an error occurs, 0 if
+ *                          it needs to wait, any other value otherwise.
+ *
+ *
+ *  - http_headers        : Called before the body parsing, after all HTTP
+ *                          headers were parsed and analyzed.
+ *                          Returns a negative value if an error occurs, 0 if
+ *                          it needs to wait, any other value otherwise.
+ *  - http_payload        : Called when some data can be consumed.
+ *                          Returns a negative value if an error occurs, else
+ *                          the number of forwarded bytes.
+ *  - http_end            : Called when all the request/response has been
+ *                          processed and all body data has been forwarded.
+ *                          Returns a negative value if an error occurs, 0 if
+ *                          it needs to wait for some reason, any other value
+ *                          otherwise.
+ *  - http_reset          : Called when the HTTP message is reset. It happens
+ *                          either when a 100-continue response is received,
+ *                          which can be detected because s->txn->status is
+ *                          10X, or when we're attempting an L7 retry.
+ *                          Returns nothing.
+ *  - http_reply          : Called when, at any time, HAProxy decides to stop
+ *                          the HTTP message's processing and to send a message
+ *                          to the client (mainly, when an error or a redirect
+ *                          occurs).
+ *                          Returns nothing.
+ *
+ *
+ *  - tcp_payload         : Called when some data can be consumed.
+ *                          Returns a negative value if an error occurs, else
+ *                          the number of forwarded bytes.
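+ *
+ * As a purely illustrative sketch (my_init and my_http_headers are invented
+ * names, not part of this file), a minimal filter could fill in only a couple
+ * of these callbacks and leave the others NULL:
+ *
+ *     struct flt_ops my_flt_ops = {
+ *             .init         = my_init,
+ *             .http_headers = my_http_headers,
+ *     };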
+ */
+struct flt_ops {
+        /*
+         * Callbacks to manage the filter lifecycle
+         */
+        int  (*init)             (struct proxy *p, struct flt_conf *fconf);
+        void (*deinit)           (struct proxy *p, struct flt_conf *fconf);
+        int  (*check)            (struct proxy *p, struct flt_conf *fconf);
+        int  (*init_per_thread)  (struct proxy *p, struct flt_conf *fconf);
+        void (*deinit_per_thread)(struct proxy *p, struct flt_conf *fconf);
+        /*
+         * Stream callbacks
+         */
+        int  (*attach)            (struct stream *s, struct filter *f);
+        int  (*stream_start)      (struct stream *s, struct filter *f);
+        int  (*stream_set_backend)(struct stream *s, struct filter *f, struct proxy *be);
+        void (*stream_stop)       (struct stream *s, struct filter *f);
+        void (*detach)            (struct stream *s, struct filter *f);
+        void (*check_timeouts)    (struct stream *s, struct filter *f);
+        /*
+         * Channel callbacks
+         */
+        int (*channel_start_analyze)(struct stream *s, struct filter *f, struct channel *chn);
+        int (*channel_pre_analyze)  (struct stream *s, struct filter *f, struct channel *chn, unsigned int an_bit);
+        int (*channel_post_analyze) (struct stream *s, struct filter *f, struct channel *chn, unsigned int an_bit);
+        int (*channel_end_analyze)  (struct stream *s, struct filter *f, struct channel *chn);
+
+        /*
+         * HTTP callbacks
+         */
+        int  (*http_headers)      (struct stream *s, struct filter *f, struct http_msg *msg);
+        int  (*http_payload)      (struct stream *s, struct filter *f, struct http_msg *msg,
+                                   unsigned int offset, unsigned int len);
+        int  (*http_end)          (struct stream *s, struct filter *f, struct http_msg *msg);
+
+        void (*http_reset)        (struct stream *s, struct filter *f, struct http_msg *msg);
+        void (*http_reply)        (struct stream *s, struct filter *f, short status,
+                                   const struct buffer *msg);
+
+        /*
+         * TCP callbacks
+         */
+        int  (*tcp_payload)       (struct stream *s, struct filter *f, struct channel *chn,
+                                   unsigned int offset, unsigned int len);
+};
+
+/*
+ * Structure representing the filter configuration, attached to a proxy and
+ * accessible from a filter when instantiated in a stream
+ */
+struct flt_conf {
+        const char *id;      /* The filter id */
+        struct flt_ops *ops; /* The filter callbacks */
+        void *conf;          /* The filter configuration */
+        struct list list;    /* Next filter for the same proxy */
+        unsigned int flags;  /* FLT_CFG_FL_* */
+};
+
+/*
+ * Structure representing a filter instance attached to a stream
+ *
+ * 2D-Array fields are used to store info per channel. The first index stands
+ * for the request channel, and the second one for the response channel.
+ * Especially, <next> and <fwd> are offsets representing the amounts of data
+ * that the filter has, respectively, parsed and forwarded on a channel. Filters
+ * can access these values using FLT_NXT and FLT_FWD macros.
+ */
+struct filter {
+        struct flt_conf *config;      /* the filter's configuration */
+        void *ctx;                    /* The filter context (opaque) */
+        unsigned short flags;         /* FLT_FL_* */
+        unsigned long long offset[2]; /* Offset of input data already filtered for a specific channel
+                                       * 0: request channel, 1: response channel */
+        unsigned int pre_analyzers;   /* bit field indicating analyzers to pre-process */
+        unsigned int post_analyzers;  /* bit field indicating analyzers to post-process */
+        struct list list;             /* Next filter for the same proxy/stream */
+};
+
+/*
+ * Structure representing the "global" state of filters attached to a stream.
+ */
+struct strm_flt {
+        struct list filters;       /* List of filters attached to a stream */
+        struct filter *current[2]; /* Filter from which to resume processing, for a specific channel.
+                                    * This is used for resumable callbacks only.
+                                    * If NULL, we start from the first filter.
+                                    * 0: request channel, 1: response channel */
+        unsigned short flags;              /* STRM_FLT_FL_* */
+        unsigned char nb_req_data_filters; /* Number of data filters registered on the request channel */
+        unsigned char nb_rsp_data_filters; /* Number of data filters registered on the response channel */
+        unsigned long long offset[2];
+};
+
+#endif /* _HAPROXY_FILTERS_T_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/filters.h b/include/haproxy/filters.h new file mode 100644 index 0000000..4a32c21 --- /dev/null +++ b/include/haproxy/filters.h @@ -0,0 +1,187 @@
+/*
+ * include/haproxy/filters.h
+ * This file defines function prototypes for stream filters management.
+ *
+ * Copyright (C) 2015 Qualys Inc., Christopher Faulet <cfaulet@qualys.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef _HAPROXY_FILTERS_H
+#define _HAPROXY_FILTERS_H
+
+#include <haproxy/channel.h>
+#include <haproxy/filters-t.h>
+#include <haproxy/http_ana-t.h>
+#include <haproxy/proxy-t.h>
+#include <haproxy/stream-t.h>
+
+extern const char *trace_flt_id;
+extern const char *http_comp_flt_id;
+extern const char *cache_store_flt_id;
+extern const char *spoe_filter_id;
+extern const char *fcgi_flt_id;
+
+#define FLT_ID(flt)   (flt)->config->id
+#define FLT_CONF(flt) (flt)->config->conf
+#define FLT_OPS(flt)  (flt)->config->ops
+
+/* Useful macros to access per-channel values. They can be safely used inside
+ * filters. */
+#define CHN_IDX(chn)         (((chn)->flags & CF_ISRESP) == CF_ISRESP)
+#define FLT_STRM_OFF(s, chn) (strm_flt(s)->offset[CHN_IDX(chn)])
+#define FLT_OFF(flt, chn)    ((flt)->offset[CHN_IDX(chn)])
+
+#define HAS_FILTERS(strm) ((strm)->strm_flt.flags & STRM_FLT_FL_HAS_FILTERS)
+
+#define HAS_REQ_DATA_FILTERS(strm) ((strm)->strm_flt.nb_req_data_filters != 0)
+#define HAS_RSP_DATA_FILTERS(strm) ((strm)->strm_flt.nb_rsp_data_filters != 0)
+#define HAS_DATA_FILTERS(strm, chn) (((chn)->flags & CF_ISRESP) ? HAS_RSP_DATA_FILTERS(strm) : HAS_REQ_DATA_FILTERS(strm))
+
+#define IS_REQ_DATA_FILTER(flt) ((flt)->flags & FLT_FL_IS_REQ_DATA_FILTER)
+#define IS_RSP_DATA_FILTER(flt) ((flt)->flags & FLT_FL_IS_RSP_DATA_FILTER)
+#define IS_DATA_FILTER(flt, chn) (((chn)->flags & CF_ISRESP) ? IS_RSP_DATA_FILTER(flt) : IS_REQ_DATA_FILTER(flt))
+
+#define FLT_STRM_CB(strm, call)                                         \
+        do {                                                            \
+                if (HAS_FILTERS(strm)) { call; }                        \
+        } while (0)
+
+#define FLT_STRM_DATA_CB_IMPL_1(strm, chn, call, default_ret)           \
+        (HAS_DATA_FILTERS(strm, chn) ?
call : default_ret)
+#define FLT_STRM_DATA_CB_IMPL_2(strm, chn, call, default_ret, on_error) \
+        ({                                                              \
+                int _ret;                                               \
+                if (HAS_DATA_FILTERS(strm, chn)) {                      \
+                        _ret = call;                                    \
+                        if (_ret < 0) { on_error; }                     \
+                }                                                       \
+                else                                                    \
+                        _ret = default_ret;                             \
+                _ret;                                                   \
+        })
+#define FLT_STRM_DATA_CB_IMPL_3(strm, chn, call, default_ret, on_error, on_wait) \
+        ({                                                              \
+                int _ret;                                               \
+                if (HAS_DATA_FILTERS(strm, chn)) {                      \
+                        _ret = call;                                    \
+                        if (_ret < 0) { on_error; }                     \
+                        if (!_ret)    { on_wait;  }                     \
+                }                                                       \
+                else                                                    \
+                        _ret = default_ret;                             \
+                _ret;                                                   \
+        })
+
+#define FLT_STRM_DATA_CB_IMPL_X(strm, chn, call, A, B, C, DATA_CB_IMPL, ...) \
+        DATA_CB_IMPL
+
+#define FLT_STRM_DATA_CB(strm, chn, call, ...)                                \
+        FLT_STRM_DATA_CB_IMPL_X(strm, chn, call, ##__VA_ARGS__,               \
+                                FLT_STRM_DATA_CB_IMPL_3(strm, chn, call, ##__VA_ARGS__), \
+                                FLT_STRM_DATA_CB_IMPL_2(strm, chn, call, ##__VA_ARGS__), \
+                                FLT_STRM_DATA_CB_IMPL_1(strm, chn, call, ##__VA_ARGS__))
+
+void flt_deinit(struct proxy *p);
+int flt_check(struct proxy *p);
+
+int flt_stream_start(struct stream *s);
+void flt_stream_stop(struct stream *s);
+int flt_set_stream_backend(struct stream *s, struct proxy *be);
+int flt_stream_init(struct stream *s);
+void flt_stream_release(struct stream *s, int only_backend);
+void flt_stream_check_timeouts(struct stream *s);
+
+int flt_http_payload(struct stream *s, struct http_msg *msg, unsigned int len);
+int flt_http_end(struct stream *s, struct http_msg *msg);
+
+void flt_http_reset(struct stream *s, struct http_msg *msg);
+void flt_http_reply(struct stream *s, short status, const struct buffer *msg);
+
+int flt_start_analyze(struct stream *s, struct channel *chn, unsigned int an_bit);
+int flt_pre_analyze(struct stream *s, struct channel *chn, unsigned int an_bit);
+int flt_post_analyze(struct stream *s, struct channel *chn, unsigned int an_bit);
+int flt_analyze_http_headers(struct stream *s, struct channel *chn, unsigned int an_bit);
+int flt_end_analyze(struct stream *s, struct channel *chn, unsigned int an_bit);
+
+int flt_xfer_data(struct stream *s, struct channel *chn, unsigned int an_bit);
+
+void flt_register_keywords(struct flt_kw_list *kwl);
+struct flt_kw *flt_find_kw(const char *kw);
+void flt_dump_kws(char **out);
+void list_filters(FILE *out);
+
+/* Helper function that returns the "global" state of filters attached to a
+ * stream. */
+static inline struct strm_flt *
+strm_flt(struct stream *s)
+{
+        return &s->strm_flt;
+}
+
+/* Registers a filter to a channel. If a filter was already registered, this
+ * function does nothing. Once registered, the filter becomes a "data" filter
+ * for this channel. */
+static inline void
+register_data_filter(struct stream *s, struct channel *chn, struct filter *filter)
+{
+        if (!IS_DATA_FILTER(filter, chn)) {
+                if (chn->flags & CF_ISRESP) {
+                        filter->flags |= FLT_FL_IS_RSP_DATA_FILTER;
+                        strm_flt(s)->nb_rsp_data_filters++;
+                }
+                else {
+                        filter->flags |= FLT_FL_IS_REQ_DATA_FILTER;
+                        strm_flt(s)->nb_req_data_filters++;
+                }
+        }
+}
+
+/* Unregisters a "data" filter from a channel. */
+static inline void
+unregister_data_filter(struct stream *s, struct channel *chn, struct filter *filter)
+{
+        if (IS_DATA_FILTER(filter, chn)) {
+                if (chn->flags & CF_ISRESP) {
+                        filter->flags &= ~FLT_FL_IS_RSP_DATA_FILTER;
+                        strm_flt(s)->nb_rsp_data_filters--;
+
+                }
+                else {
+                        filter->flags &= ~FLT_FL_IS_REQ_DATA_FILTER;
+                        strm_flt(s)->nb_req_data_filters--;
+                }
+        }
+}
+
+/* This function must be called when a filter alters payload data. It updates
+ * offsets of all previous filters.
Do not omit it when a filter
+ * changes the size of the payload data: that leads to undefined behavior.
+ *
+ * It is the filter's responsibility to update the data itself.
+ */
+static inline void
+flt_update_offsets(struct filter *filter, struct channel *chn, int len)
+{
+        struct stream *s = chn_strm(chn);
+        struct filter *f;
+
+        list_for_each_entry(f, &strm_flt(s)->filters, list) {
+                if (f == filter)
+                        break;
+                FLT_OFF(f, chn) += len;
+        }
+}
+
+#endif /* _HAPROXY_FILTERS_H */
diff --git a/include/haproxy/fix-t.h b/include/haproxy/fix-t.h new file mode 100644 index 0000000..4b4de55 --- /dev/null +++ b/include/haproxy/fix-t.h @@ -0,0 +1,70 @@
+/*
+ * include/haproxy/fix-t.h
+ * This file contains structure declarations for the FIX protocol.
+ *
+ * Copyright 2020 Baptiste Assmann <bedis9@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_FIX_T_H
+#define _HAPROXY_FIX_T_H
+
+#include <import/ist.h>
+
+/*
+ * FIX messages are composed of a list of Tag=Value pairs separated by a
+ * 'delimiter'
+ */
+#define FIX_DELIMITER 0x01
+
+/*
+ * known FIX version strings
+ */
+#define FIX_4_0 (ist("FIX.4.0"))
+#define FIX_4_1 (ist("FIX.4.1"))
+#define FIX_4_2 (ist("FIX.4.2"))
+#define FIX_4_3 (ist("FIX.4.3"))
+#define FIX_4_4 (ist("FIX.4.4"))
+#define FIX_5_0 (ist("FIXT.1.1"))
+/* FIX_5_0SP1 and FIX_5_0SP2 have the same version string as FIX_5_0 */
+
+/*
+ * Supported FIX tag IDs
+ */
+#define FIX_TAG_BeginString 8
+#define FIX_TAG_BodyLength 9
+#define FIX_TAG_CheckSum 10
+#define FIX_TAG_MsgType 35
+#define FIX_TAG_SenderCompID 49
+#define FIX_TAG_TargetCompID 56
+
+
+#define FIX_MSG_MINSIZE 26 /* Minimal length for a FIX Message */
+#define FIX_CHKSUM_SIZE 7 /* Length of the CheckSum tag (10=NNN<delim>) */
+/*
+ * return codes when parsing / validating FIX messages
+ */
+#define FIX_INVALID_MESSAGE -1
+#define FIX_NEED_MORE_DATA 0
+#define FIX_VALID_MESSAGE 1
+
+#endif /* _HAPROXY_FIX_T_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/fix.h b/include/haproxy/fix.h new file mode 100644 index 0000000..94aa815 --- /dev/null +++ b/include/haproxy/fix.h @@ -0,0 +1,97 @@
+/*
+ * include/haproxy/fix.h
+ * This file contains function and macro declarations for FIX protocol decoding.
+ *
+ * Copyright 2020 Baptiste Assmann <bedis9@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_FIX_H
+#define _HAPROXY_FIX_H
+
+#include <import/ist.h>
+
+#include <haproxy/fix-t.h>
+#include <haproxy/tools.h>
+
+unsigned int fix_check_id(const struct ist str, const struct ist version);
+int fix_validate_message(const struct ist msg);
+struct ist fix_tag_value(const struct ist msg, unsigned int tagid);
+
+/*
+ * Return the FIX version string (one of FIX_X_Y macros) corresponding to
+ * <str> or IST_NULL if not found.
+ */
+static inline struct ist fix_version(const struct ist str)
+{
+        /* 7 is the minimal size for the FIX version string */
+        if (istlen(str) < 7)
+                return IST_NULL;
+
+        if (isteq(FIX_4_0, str))
+                return FIX_4_0;
+        else if (isteq(FIX_4_1, str))
+                return FIX_4_1;
+        else if (isteq(FIX_4_2, str))
+                return FIX_4_2;
+        else if (isteq(FIX_4_3, str))
+                return FIX_4_3;
+        else if (isteq(FIX_4_4, str))
+                return FIX_4_4;
+        else if (isteq(FIX_5_0, str))
+                return FIX_5_0;
+
+        return IST_NULL;
+}
+
+/*
+ * Return the FIX tag ID corresponding to <tag> if one is found, or 0 if not.
+ *
+ * The full list of tag IDs is available here, just in case we need to support
+ * more "string" equivalents in the future:
+ *       https://www.onixs.biz/fix-dictionary/4.2/fields_by_tag.html
+ */
+static inline unsigned int fix_tagid(const struct ist tag)
+{
+        unsigned id = fix_check_id(tag, IST_NULL);
+
+        if (id)
+                return id;
+
+        else if (isteqi(tag, ist("MsgType")))
+                return FIX_TAG_MsgType;
+        else if (isteqi(tag, ist("CheckSum")))
+                return FIX_TAG_CheckSum;
+        else if (isteqi(tag, ist("BodyLength")))
+                return FIX_TAG_BodyLength;
+        else if (isteqi(tag, ist("TargetCompID")))
+                return FIX_TAG_TargetCompID;
+        else if (isteqi(tag, ist("BeginString")))
+                return FIX_TAG_BeginString;
+        else if (isteqi(tag, ist("SenderCompID")))
+                return FIX_TAG_SenderCompID;
+
+        return 0;
+}
+
+#endif /* _HAPROXY_FIX_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/flt_http_comp.h b/include/haproxy/flt_http_comp.h new file mode 100644 index 0000000..56f984a --- /dev/null +++ b/include/haproxy/flt_http_comp.h @@ -0,0 +1,28 @@
+/*
+ * include/haproxy/flt_http_comp.h
+ * This file defines function prototypes for the compression filter.
+ *
+ * Copyright (C) 2015 Qualys Inc., Christopher Faulet <cfaulet@qualys.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef _HAPROXY_FLT_HTTP_COMP_H
+#define _HAPROXY_FLT_HTTP_COMP_H
+
+#include <haproxy/proxy-t.h>
+
+int check_implicit_http_comp_flt(struct proxy *proxy);
+
+#endif // _HAPROXY_FLT_HTTP_COMP_H
diff --git a/include/haproxy/freq_ctr-t.h b/include/haproxy/freq_ctr-t.h new file mode 100644 index 0000000..d5f1a89 --- /dev/null +++ b/include/haproxy/freq_ctr-t.h @@ -0,0 +1,45 @@
+/*
+ * include/haproxy/freq_ctr-t.h
+ * This file contains structure declarations for frequency counters.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_FREQ_CTR_T_H
+#define _HAPROXY_FREQ_CTR_T_H
+
+#include <haproxy/api-t.h>
+
+/* The generic freq_ctr counter counts a rate of events per period, where the
+ * period has to be known by the user. The period is measured in ticks and
+ * must be at least 2 ticks long. This form is slightly more CPU intensive for
+ * reads than the per-second form as it involves a divide.
+ */
+struct freq_ctr {
+        unsigned int curr_tick; /* start date of current period (wrapping ticks) */
+        unsigned int curr_ctr;  /* cumulated value for current period */
+        unsigned int prev_ctr;  /* value for last period */
+};
+
+#endif /* _HAPROXY_FREQ_CTR_T_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/freq_ctr.h b/include/haproxy/freq_ctr.h new file mode 100644 index 0000000..f3f6903 --- /dev/null +++ b/include/haproxy/freq_ctr.h @@ -0,0 +1,402 @@
+/*
+ * include/haproxy/freq_ctr.h
+ * This file contains macros and inline functions for frequency counters.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_FREQ_CTR_H +#define _HAPROXY_FREQ_CTR_H + +#include <haproxy/api.h> +#include <haproxy/freq_ctr-t.h> +#include <haproxy/intops.h> +#include <haproxy/ticks.h> + +/* exported functions from freq_ctr.c */ +ullong freq_ctr_total(const struct freq_ctr *ctr, uint period, int pend); +int freq_ctr_overshoot_period(const struct freq_ctr *ctr, uint period, uint freq); +uint update_freq_ctr_period_slow(struct freq_ctr *ctr, uint period, uint inc); + +/* Update a frequency counter by <inc> incremental units. It is automatically + * rotated if the period is over. It is important that it correctly initializes + * a null area. + */ +static inline uint update_freq_ctr_period(struct freq_ctr *ctr, uint period, uint inc) +{ + uint curr_tick; + + /* our local clock (now_ms) is most of the time strictly equal to + * global_now_ms, and during the edge of the millisecond, global_now_ms + * might have been pushed further by another thread. Given that + * accessing this shared variable is extremely expensive, we first try + * to use our local date, which will be good almost every time. And we + * only switch to the global clock when we're out of the period so as + * to never put a date in the past there. + */ + curr_tick = HA_ATOMIC_LOAD(&ctr->curr_tick); + if (likely(now_ms - curr_tick < period)) + return HA_ATOMIC_ADD_FETCH(&ctr->curr_ctr, inc); + + return update_freq_ctr_period_slow(ctr, period, inc); +} + +/* Update a 1-sec frequency counter by <inc> incremental units. It is automatically + * rotated if the period is over. It is important that it correctly initializes + * a null area. + */ +static inline unsigned int update_freq_ctr(struct freq_ctr *ctr, unsigned int inc) +{ + return update_freq_ctr_period(ctr, MS_TO_TICKS(1000), inc); +} + +/* Reads a frequency counter taking history into account for missing time in + * current period. The period has to be passed in number of ticks and must + * match the one used to feed the counter. The counter value is reported for + * current global date. The return value has the same precision as one input + * data sample, so low rates over the period will be inaccurate but still + * appropriate for max checking. One trick we use for low values is to specially + * handle the case where the rate is between 0 and 1 in order to avoid flapping + * while waiting for the next event. + * + * For immediate limit checking, it's recommended to use freq_ctr_period_remain() + * instead which does not have the flapping correction, so that even frequencies + * as low as one event/period are properly handled. + */ +static inline uint read_freq_ctr_period(const struct freq_ctr *ctr, uint period) +{ + ullong total = freq_ctr_total(ctr, period, -1); + + return div64_32(total, period); +} + +/* same as read_freq_ctr_period() above except that floats are used for the + * output so that low rates can be more precise. + */ +static inline double read_freq_ctr_period_flt(const struct freq_ctr *ctr, uint period) +{ + ullong total = freq_ctr_total(ctr, period, -1); + + return (double)total / (double)period; +} + +/* Read a 1-sec frequency counter taking history into account for missing time + * in current period. 
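+ * As a purely illustrative example, the current global session rate could be
+ * sampled with read_freq_ctr(&global.sess_per_sec), assuming that counter is
+ * fed by update_freq_ctr() for each new session.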
+ */
+static inline unsigned int read_freq_ctr(const struct freq_ctr *ctr)
+{
+        return read_freq_ctr_period(ctr, MS_TO_TICKS(1000));
+}
+
+/* same as read_freq_ctr() above except that floats are used for the
+ * output so that low rates can be more precise.
+ */
+static inline double read_freq_ctr_flt(const struct freq_ctr *ctr)
+{
+        return read_freq_ctr_period_flt(ctr, MS_TO_TICKS(1000));
+}
+
+/* Returns the number of remaining events that can occur on this freq counter
+ * while respecting <freq> events per period, and taking into account that
+ * <pend> events are already known to be pending. Returns 0 if limit was reached.
+ */
+static inline uint freq_ctr_remain_period(const struct freq_ctr *ctr, uint period, uint freq, uint pend)
+{
+        ullong total = freq_ctr_total(ctr, period, pend);
+        uint avg = div64_32(total, period);
+
+        if (avg > freq)
+                avg = freq;
+        return freq - avg;
+}
+
+/* returns the number of remaining events that can occur on this freq counter
+ * while respecting <freq> and taking into account that <pend> events are
+ * already known to be pending. Returns 0 if limit was reached.
+ */
+static inline unsigned int freq_ctr_remain(const struct freq_ctr *ctr, unsigned int freq, unsigned int pend)
+{
+        return freq_ctr_remain_period(ctr, MS_TO_TICKS(1000), freq, pend);
+}
+
+/* return the expected wait time in ms before the next event may occur,
+ * respecting frequency <freq>, and assuming there may already be some pending
+ * events. It returns zero if we can proceed immediately, otherwise the wait
+ * time, which will be rounded down to the millisecond for better accuracy,
+ * with a minimum of one ms.
+ */
+static inline uint next_event_delay_period(const struct freq_ctr *ctr, uint period, uint freq, uint pend)
+{
+        ullong total = freq_ctr_total(ctr, period, pend);
+        ullong limit = (ullong)freq * period;
+        uint wait;
+
+        if (total < limit)
+                return 0;
+
+        /* too many events already, let's count how long to wait before they're
+         * processed. For this we'll subtract from the number of pending events
+         * the ones programmed for the current period, to know how long to wait
+         * for the next period. Each event takes period/freq ticks.
+         */
+        total -= limit;
+        wait = div64_32(total, (freq ? freq : 1));
+        return MAX(wait, 1);
+}
+
+/* Returns the expected wait time in ms before the next event may occur,
+ * respecting frequency <freq> over 1 second, and assuming there may already be
+ * some pending events. It returns zero if we can proceed immediately, otherwise
+ * the wait time, which will be rounded down to the millisecond for better
+ * accuracy, with a minimum of one ms.
+ */
+static inline unsigned int next_event_delay(const struct freq_ctr *ctr, unsigned int freq, unsigned int pend)
+{
+        return next_event_delay_period(ctr, MS_TO_TICKS(1000), freq, pend);
+}
+
+/* While the functions above report average event counts per period, we are
+ * also interested in average values per event. For this we use a different
+ * method. The principle is to rely on a long tail which sums the new value
+ * with a fraction of the previous value, resulting in a sliding window of
+ * infinite length depending on the precision we're interested in.
+ *
+ * The idea is that we always keep (N-1)/N of the sum and add the new sampled
+ * value.
The sum over N values can be computed with a simple program for a
+ * constant value 1 at each iteration :
+ *
+ *     N
+ *   ,---
+ *    \       N - 1           e - 1
+ *     >  ( ------- )^x ~= N * -----
+ *    /         N                e
+ *   '---
+ *   x = 1
+ *
+ * Note: I'm not sure how to demonstrate this but at least this is easily
+ * verified with a simple program, the sum equals N * 0.632120 for any N
+ * moderately large (tens to hundreds).
+ *
+ * Inserting a constant sample value V here simply results in :
+ *
+ *    sum = V * N * (e - 1) / e
+ *
+ * But we don't want to integrate over a small period, but infinitely. Let's
+ * cut the infinity in P periods of N values. Each period M is exactly the same
+ * as period M-1 with a factor of ((N-1)/N)^N applied. A test shows that given a
+ * large N :
+ *
+ *      N - 1          1
+ *   ( ------- )^N ~= ---
+ *        N            e
+ *
+ * Our sum is now a sum of each factor times :
+ *
+ *    N*P                                P
+ *   ,---                              ,---
+ *    \         N - 1          e - 1    \     1
+ *     >  v ( ------- )^x ~= VN * ----- * >  ---
+ *    /           N              e      /   e^x
+ *   '---                              '---
+ *   x = 1                             x = 0
+ *
+ * For P "large enough", in tests we get this :
+ *
+ *     P
+ *   ,---
+ *    \     1        e
+ *     >  --- ~= -----
+ *    /   e^x    e - 1
+ *   '---
+ *   x = 0
+ *
+ * This simplifies the sum above :
+ *
+ *    N*P
+ *   ,---
+ *    \         N - 1
+ *     >  v ( ------- )^x = VN
+ *    /           N
+ *   '---
+ *   x = 1
+ *
+ * So basically, by summing the values and applying to the last result an
+ * (N-1)/N factor, we just get N times the values over the long term, so we can
+ * recover the constant value V by dividing by N. In order to limit the impact
+ * of integer overflows, we'll use this equivalence which saves us one
+ * multiply :
+ *
+ *               N - 1                   1             x0
+ *    x1 = x0 * ------- = x0 * ( 1 - --- ) = x0 - ----
+ *                 N                     N              N
+ *
+ * And given that x0 is discrete here we'll have to saturate the values before
+ * performing the divide, so the value insertion will become :
+ *
+ *               x0 + N - 1
+ *    x1 = x0 - ------------
+ *                   N
+ *
+ * A value added at the entry of the sliding window of N values will thus be
+ * reduced to 1/e or 36.7% after N terms have been added. After a second batch,
+ * it will only be 1/e^2, or 13.5%, and so on. So practically speaking, each
+ * old period of N values represents only a quickly fading ratio of the global
+ * sum :
+ *
+ *   period    ratio
+ *     1       36.7%
+ *     2       13.5%
+ *     3        4.98%
+ *     4        1.83%
+ *     5        0.67%
+ *     6        0.25%
+ *     7        0.09%
+ *     8        0.033%
+ *     9        0.012%
+ *    10        0.0045%
+ *
+ * So after 10N samples, the initial value has already faded out by a factor of
+ * 22026, which is quite fast. If the sliding window is 1024 samples wide, it
+ * means that a sample will only count for 1/22k of its initial value after 10k
+ * samples went after it, which results in half of the value it would represent
+ * using an arithmetic mean. The benefit of this method is that it's very cheap
+ * in terms of computations when N is a power of two. This is very well suited
+ * to record response times as large values will fade out faster than with an
+ * arithmetic mean and will depend on sample count and not time.
+ *
+ * Demonstrating all the above assumptions with maths instead of a program is
+ * left as an exercise for the reader.
+ */
+
+/* Adds sample value <v> to sliding window sum <sum> configured for <n> samples.
+ * The new sum is returned. Better if <n> is a power of two. This function is
+ * thread-safe.
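+ * Illustrative numeric check (plain arithmetic, not a test from this file):
+ * feeding a constant v=100 with n=512 makes the sum converge towards
+ * n*v = 51200, and swrate_avg(51200, 512) then yields 100 again.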
+ */
+static inline unsigned int swrate_add(unsigned int *sum, unsigned int n, unsigned int v)
+{
+        unsigned int new_sum, old_sum;
+
+        old_sum = *sum;
+        do {
+                new_sum = old_sum - (old_sum + n - 1) / n + v;
+        } while (!HA_ATOMIC_CAS(sum, &old_sum, new_sum) && __ha_cpu_relax());
+        return new_sum;
+}
+
+/* Adds sample value <v> to sliding window sum <sum> configured for <n> samples.
+ * The new sum is returned. Better if <n> is a power of two. This function is
+ * thread-safe.
+ * This function should give better accuracy than swrate_add when the number of
+ * samples collected is lower than the nominal window size. In such
+ * circumstances <n> should be set to 0.
+ */
+static inline unsigned int swrate_add_dynamic(unsigned int *sum, unsigned int n, unsigned int v)
+{
+        unsigned int new_sum, old_sum;
+
+        old_sum = *sum;
+        do {
+                new_sum = old_sum - (n ? (old_sum + n - 1) / n : 0) + v;
+        } while (!HA_ATOMIC_CAS(sum, &old_sum, new_sum) && __ha_cpu_relax());
+        return new_sum;
+}
+
+/* Adds sample value <v> spanning <s> samples to sliding window sum <sum>
+ * configured for <n> samples, where <n> is supposed to be "much larger" than
+ * <s>. The new sum is returned. Better if <n> is a power of two. Note that this
+ * is only an approximation. Indeed, as can be seen with two samples only over a
+ * 8-sample window, the original function would return :
+ *  sum1 = sum  - (sum + 7) / 8 + v
+ *  sum2 = sum1 - (sum1 + 7) / 8 + v
+ *       = (sum - (sum + 7) / 8 + v) - (sum - (sum + 7) / 8 + v + 7) / 8 + v
+ *      ~= 7sum/8 - 7/8 + v - sum/8 + sum/64 - 7/64 - v/8 - 7/8 + v
+ *      ~= (3sum/4 + sum/64) - (7/4 + 7/64) + 15v/8
+ *
+ * while the function below would return :
+ *  sum  = sum + 2*v - (sum + 8) * 2 / 8
+ *       = 3sum/4 + 2v - 2
+ *
+ * this presents an error of ~ (sum/64 + 9/64 + v/8) = (sum+n+1)/(n^s) + v/n
+ *
+ * Thus the simplified function effectively replaces a part of the history with
+ * a linear sum instead of applying the exponential one. But as long as s/n is
+ * "small enough", the error fades away and remains small for both small and
+ * large values of n and s (typically < 0.2% measured). This function is
+ * thread-safe.
+ */
+static inline unsigned int swrate_add_scaled(unsigned int *sum, unsigned int n, unsigned int v, unsigned int s)
+{
+        unsigned int new_sum, old_sum;
+
+        old_sum = *sum;
+        do {
+                new_sum = old_sum + v * s - div64_32((unsigned long long)old_sum * s + n - 1, n);
+        } while (!HA_ATOMIC_CAS(sum, &old_sum, new_sum) && __ha_cpu_relax());
+        return new_sum;
+}
+
+/* opportunistic versions of the functions above: an attempt is made to update
+ * the value, but in case of contention, it's not retried. This is fine when
+ * rough estimates are needed and speed is preferred over accuracy.
+ */
+
+static inline uint swrate_add_opportunistic(uint *sum, uint n, uint v)
+{
+        uint new_sum, old_sum;
+
+        old_sum = *sum;
+        new_sum = old_sum - (old_sum + n - 1) / n + v;
+        HA_ATOMIC_CAS(sum, &old_sum, new_sum);
+        return new_sum;
+}
+
+static inline uint swrate_add_dynamic_opportunistic(uint *sum, uint n, uint v)
+{
+        uint new_sum, old_sum;
+
+        old_sum = *sum;
+        new_sum = old_sum - (n ?
(old_sum + n - 1) / n : 0) + v; + HA_ATOMIC_CAS(sum, &old_sum, new_sum); + return new_sum; +} + +static inline uint swrate_add_scaled_opportunistic(uint *sum, uint n, uint v, uint s) +{ + uint new_sum, old_sum; + + old_sum = *sum; + new_sum = old_sum + v * s - div64_32((unsigned long long)old_sum * s + n - 1, n); + HA_ATOMIC_CAS(sum, &old_sum, new_sum); + return new_sum; +} + +/* Returns the average sample value for the sum <sum> over a sliding window of + * <n> samples. Better if <n> is a power of two. It must be the same <n> as the + * one used above in all additions. + */ +static inline unsigned int swrate_avg(unsigned int sum, unsigned int n) +{ + return (sum + n - 1) / n; +} + +#endif /* _HAPROXY_FREQ_CTR_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/frontend.h b/include/haproxy/frontend.h new file mode 100644 index 0000000..8cd1a0a --- /dev/null +++ b/include/haproxy/frontend.h @@ -0,0 +1,38 @@ +/* + * include/haproxy/frontend.h + * This file declares frontend-specific functions. + * + * Copyright (C) 2000-2011 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_FRONTEND_H +#define _HAPROXY_FRONTEND_H + +#include <haproxy/stream-t.h> + +int frontend_accept(struct stream *s); + +int increment_actconn(); + +#endif /* _HAPROXY_FRONTEND_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/global-t.h b/include/haproxy/global-t.h new file mode 100644 index 0000000..9b3cd78 --- /dev/null +++ b/include/haproxy/global-t.h @@ -0,0 +1,251 @@ +/* + * include/haproxy/global-t.h + * Global types and macros. Please avoid adding more stuff here! + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_GLOBAL_T_H
+#define _HAPROXY_GLOBAL_T_H
+
+#include <haproxy/api-t.h>
+#include <haproxy/buf-t.h>
+#include <haproxy/freq_ctr-t.h>
+
+/* modes of operation (global.mode) */
+#define MODE_DEBUG 0x01
+#define MODE_DAEMON 0x02
+#define MODE_QUIET 0x04
+#define MODE_CHECK 0x08
+#define MODE_VERBOSE 0x10
+#define MODE_STARTING 0x20
+#define MODE_FOREGROUND 0x40
+#define MODE_MWORKER 0x80 /* Master Worker */
+#define MODE_MWORKER_WAIT 0x100 /* Master Worker wait mode */
+#define MODE_ZERO_WARNING 0x200 /* warnings cause a failure */
+#define MODE_DIAG 0x400 /* extra warnings */
+#define MODE_CHECK_CONDITION 0x800 /* -cc mode */
+#define MODE_STOPPING 0x1000 /* the process is in the deinit phase, the event loop is not running anymore. */
+#define MODE_DUMP_LIBS 0x2000 /* dump loaded libraries at the end of init phase */
+#define MODE_DUMP_KWD 0x4000 /* dump registered keywords (see kwd_dump for the list) */
+#define MODE_DUMP_CFG 0x8000 /* dump the configuration file */
+#define MODE_DUMP_NB_L 0x10000 /* dump line numbers when the configuration file is dumped */
+
+/* list of last checks to perform, depending on config options */
+#define LSTCHK_CAP_BIND 0x00000001 /* check that we can bind to any port */
+#define LSTCHK_NETADM 0x00000002 /* check that we have CAP_NET_ADMIN */
+
+/* Global tuning options */
+/* available polling mechanisms */
+#define GTUNE_USE_SELECT (1<<0)
+#define GTUNE_USE_POLL (1<<1)
+#define GTUNE_USE_EPOLL (1<<2)
+#define GTUNE_USE_KQUEUE (1<<3)
+/* platform-specific options */
+#define GTUNE_USE_SPLICE (1<<4)
+#define GTUNE_USE_GAI (1<<5)
+#define GTUNE_LIMITED_QUIC (1<<6)
+#define GTUNE_RESOLVE_DONTFAIL (1<<7)
+
+#define GTUNE_SOCKET_TRANSFER (1<<8)
+#define GTUNE_NOEXIT_ONFAILURE (1<<9)
+#define GTUNE_USE_SYSTEMD (1<<10)
+
+#define GTUNE_BUSY_POLLING (1<<11)
+/* (1<<12) unused */
+#define GTUNE_SET_DUMPABLE (1<<13)
+#define GTUNE_USE_EVPORTS (1<<14)
+#define GTUNE_STRICT_LIMITS (1<<15)
+#define GTUNE_INSECURE_FORK (1<<16)
+#define GTUNE_INSECURE_SETUID (1<<17)
+#define GTUNE_FD_ET (1<<18)
+#define GTUNE_SCHED_LOW_LATENCY (1<<19)
+#define GTUNE_IDLE_POOL_SHARED (1<<20)
+#define GTUNE_DISABLE_H2_WEBSOCKET (1<<21)
+#define GTUNE_DISABLE_ACTIVE_CLOSE (1<<22)
+#define GTUNE_QUICK_EXIT (1<<23)
+#define GTUNE_QUIC_SOCK_PER_CONN (1<<24)
+#define GTUNE_NO_QUIC (1<<25)
+#define GTUNE_USE_FAST_FWD (1<<26)
+#define GTUNE_LISTENER_MQ_FAIR (1<<27)
+#define GTUNE_LISTENER_MQ_OPT (1<<28)
+#define GTUNE_LISTENER_MQ_ANY (GTUNE_LISTENER_MQ_FAIR | GTUNE_LISTENER_MQ_OPT)
+
+#define NO_ZERO_COPY_FWD 0x0001 /* Globally disable zero-copy FF */
+#define NO_ZERO_COPY_FWD_PT 0x0002 /* disable zero-copy FF for PT (recv & send are disabled automatically) */
+#define NO_ZERO_COPY_FWD_H1_RCV 0x0004 /* disable zero-copy FF for H1 on receive */
+#define NO_ZERO_COPY_FWD_H1_SND 0x0008 /* disable zero-copy FF for H1 on send */
+#define NO_ZERO_COPY_FWD_H2_RCV 0x0010 /* disable zero-copy FF for H2 on receive */
+#define NO_ZERO_COPY_FWD_H2_SND 0x0020 /* disable zero-copy FF for H2 on send */
+#define NO_ZERO_COPY_FWD_QUIC_RCV 0x0040 /* disable zero-copy FF for QUIC on receive */
+#define NO_ZERO_COPY_FWD_QUIC_SND 0x0080 /* disable zero-copy FF for QUIC on send */
+#define NO_ZERO_COPY_FWD_FCGI_RCV 0x0100 /* disable zero-copy FF for FCGI on receive */
+#define NO_ZERO_COPY_FWD_FCGI_SND 0x0200 /* disable zero-copy FF for FCGI on send */ + + +extern int cluster_secret_isset; /* non zero means a cluster secret was initialized */ + +/* SSL server verify mode */ +enum { + SSL_SERVER_VERIFY_NONE = 0, + SSL_SERVER_VERIFY_REQUIRED = 1, +}; + +/* bit values to go with "warned" above */ +#define WARN_ANY 0x00000001 /* any warning was emitted */ +#define WARN_FORCECLOSE_DEPRECATED 0x00000002 +#define WARN_EXEC_PATH 0x00000004 /* executable path already reported */ + +/* put there the forward declarations needed for global.h */ +struct proxy; + +/* FIXME : this will have to be redefined correctly */ +struct global { + int uid; + int gid; + int external_check; /* 0=disabled, 1=enabled, 2=enabled with env */ + int nbthread; + int mode; + unsigned int hard_stop_after; /* maximum time allowed to perform a soft-stop */ + unsigned int grace_delay; /* grace delay between SIGUSR1 and soft-stop */ + unsigned int close_spread_time; /* time window during which connection closing is spread */ + unsigned int close_spread_end; /* end of close spread window */ + int maxconn, hardmaxconn; + int maxsslconn; + int ssl_session_max_cost; /* how many bytes an SSL session may cost */ + int ssl_handshake_max_cost; /* how many bytes an SSL handshake may use */ + int ssl_used_frontend; /* non-zero if SSL is used in a frontend */ + int ssl_used_backend; /* non-zero if SSL is used in a backend */ + int ssl_used_async_engines; /* number of used async engines */ + unsigned int ssl_server_verify; /* default verify mode on servers side */ + int comp_rate_lim; /* HTTP compression rate limit */ + int maxpipes; /* max # of pipes */ + int maxsock; /* max # of sockets */ + int rlimit_nofile; /* default ulimit-n value : 0=unset */ + int rlimit_memmax_all; /* default all-process memory limit in megs ; 0=unset */ + int rlimit_memmax; /* default per-process memory limit in megs ; 0=unset */ + long maxzlibmem; /* max RAM for zlib in bytes */ + int nbtgroups; /* number of thread groups (IDs start at 1) */ + int spread_checks; + int max_spread_checks; + int max_syslog_len; + char *chroot; + char *pidfile; + char *node, *desc; /* node name & description */ + int localpeer_cmdline; /* whether or not the commandline "-L" was set */ + int fd_hard_limit; /* hard limit on ulimit-n : 0=unset */ + struct buffer log_tag; /* name for syslog */ + struct list loggers; /* one per 'log' directive */ + char *log_send_hostname; /* set hostname in syslog header */ + char *server_state_base; /* path to a directory where server state files can be found */ + char *server_state_file; /* path to the file where server states are loaded from */ + unsigned char cluster_secret[16]; /* 128 bits of an SHA1 digest of a secret defined as ASCII string */ + struct { + int maxpollevents; /* max number of poll events at once */ + int maxaccept; /* max number of consecutive accept() */ + int options; /* various tuning options */ + int runqueue_depth;/* max number of tasks to run at once */ + int recv_enough; /* how many input bytes at once are "enough" */ + int bufsize; /* buffer size in bytes, defaults to BUFSIZE */ + int maxrewrite; /* buffer max rewrite size in bytes, defaults to MAXREWRITE */ + int reserved_bufs; /* how many buffers can only be allocated for response */ + int buf_limit; /* if not null, how many total buffers may only be allocated */ + int client_sndbuf; /* set client sndbuf to this value if not null */ + int client_rcvbuf; /* set client rcvbuf to this value if not null */ + int server_sndbuf; /* set 
server sndbuf to this value if not null */
+		int server_rcvbuf;   /* set server rcvbuf to this value if not null */
+		int frontend_sndbuf; /* set frontend dgram sndbuf to this value if not null */
+		int frontend_rcvbuf; /* set frontend dgram rcvbuf to this value if not null */
+		int backend_sndbuf;  /* set backend dgram sndbuf to this value if not null */
+		int backend_rcvbuf;  /* set backend dgram rcvbuf to this value if not null */
+		int pipesize;        /* pipe size in bytes, system defaults if zero */
+		int max_http_hdr;    /* max number of HTTP headers, use MAX_HTTP_HDR if zero */
+		int requri_len;      /* max len of request URI, use REQURI_LEN if zero */
+		int cookie_len;      /* max length of cookie captures */
+		int pattern_cache;   /* max number of entries in the pattern cache. */
+		int sslcachesize;    /* SSL cache size in sessions, defaults to 20000 */
+		int comp_maxlevel;   /* max HTTP compression level */
+		int pool_low_ratio;  /* max ratio of FDs used before we stop using new idle connections */
+		int pool_high_ratio; /* max ratio of FDs used before we start killing idle connections when creating new connections */
+		int pool_low_count;  /* max number of opened fd before we stop using new idle connections */
+		int pool_high_count; /* max number of opened fd before we start killing idle connections when creating new connections */
+		size_t pool_cache_size; /* per-thread cache size per pool (defaults to CONFIG_HAP_POOL_CACHE_SIZE) */
+		unsigned short idle_timer; /* how long before an empty buffer is considered idle (ms) */
+		unsigned short no_zero_copy_fwd; /* Flags to disable zero-copy fast-forwarding (global & per-protocols) */
+		int nb_stk_ctr;      /* number of stick counters, defaults to MAX_SESS_STKCTR */
+		int default_shards;  /* default shards for listeners, or -1 (by-thread) or -2 (by-group) */
+		uint max_checks_per_thread; /* if >0, no more than this many concurrent checks per thread */
+#ifdef USE_QUIC
+		unsigned int quic_backend_max_idle_timeout;
+		unsigned int quic_frontend_max_idle_timeout;
+		unsigned int quic_frontend_max_streams_bidi;
+		unsigned int quic_retry_threshold;
+		unsigned int quic_reorder_ratio;
+		unsigned int quic_streams_buf;
+		unsigned int quic_max_frame_loss;
+#endif /* USE_QUIC */
+	} tune;
+	struct {
+		char *prefix;           /* path prefix of unix bind socket */
+		struct {                /* UNIX socket permissions */
+			uid_t uid;      /* -1 to leave unchanged */
+			gid_t gid;      /* -1 to leave unchanged */
+			mode_t mode;    /* 0 to leave unchanged */
+		} ux;
+	} unix_bind;
+	struct proxy *cli_fe;           /* the frontend holding the stats settings */
+	int numa_cpu_mapping;
+	int prealloc_fd;
+	int cfg_curr_line;              /* line number currently being parsed */
+	const char *cfg_curr_file;      /* config file currently being parsed or NULL */
+	char *cfg_curr_section;         /* config section name currently being parsed or NULL */
+
+	/* The info above is config stuff, it doesn't change during the process' life */
+	/* A number of the elements below are updated by all threads in real time and
+	 * suffer high contention, so we need to put them in their own cache lines, if
+	 * possible grouped by changes.
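+	 * For example, the freq_ctr fields below (conn_per_sec, sess_per_sec,
+	 * ssl_per_sec, ...) are bumped by every thread on each new connection
+	 * or session, while the configuration fields above are only written at
+	 * boot, hence the explicit 64-byte alignment separating the two groups.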
+ */ + ALWAYS_ALIGN(64); + struct freq_ctr conn_per_sec; + struct freq_ctr sess_per_sec; + struct freq_ctr ssl_per_sec; + struct freq_ctr ssl_fe_keys_per_sec; + struct freq_ctr ssl_be_keys_per_sec; + struct freq_ctr comp_bps_in; /* bytes per second, before http compression */ + struct freq_ctr comp_bps_out; /* bytes per second, after http compression */ + uint sslconns, totalsslconns; /* active, total # of SSL conns */ + int cps_lim, cps_max; + int sps_lim, sps_max; + int ssl_lim, ssl_max; + int ssl_fe_keys_max, ssl_be_keys_max; + unsigned int shctx_lookups, shctx_misses; + unsigned int req_count; /* request counter (HTTP or TCP session) for logs and unique_id */ + int last_checks; + uint32_t anon_key; + + /* leave this at the end to make sure we don't share this cache line by accident */ + ALWAYS_ALIGN(64); +}; + +#endif /* _HAPROXY_GLOBAL_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/global.h b/include/haproxy/global.h new file mode 100644 index 0000000..2e7fa6b --- /dev/null +++ b/include/haproxy/global.h @@ -0,0 +1,98 @@ +/* + * include/haproxy/global.h + * Exported global variables and functions. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_GLOBAL_H +#define _HAPROXY_GLOBAL_H + +#include <haproxy/api-t.h> +#include <haproxy/global-t.h> + +extern char *build_features; +extern struct global global; +extern int pid; /* current process id */ +extern int actconn; /* # of active sessions */ +extern int listeners; +extern int jobs; /* # of active jobs (listeners, sessions, open devices) */ +extern int unstoppable_jobs; /* # of active jobs that can't be stopped during a soft stop */ +extern int active_peers; /* # of active peers (connection attempts and successes) */ +extern int connected_peers; /* # of really connected peers */ +extern int nb_oldpids; /* contains the number of old pids found */ +extern const int zero; +extern const int one; +extern const struct linger nolinger; +extern int stopping; /* non zero means stopping in progress */ +extern int killed; /* >0 means a hard-stop is triggered, >1 means hard-stop immediately */ +extern char hostname[MAX_HOSTNAME_LEN]; +extern char *localpeer; +extern unsigned int warned; /* bitfield of a few warnings to emit just once */ +extern struct list proc_list; /* list of process in mworker mode */ +extern int master; /* 1 if in master, 0 otherwise */ +extern unsigned int rlim_fd_cur_at_boot; +extern unsigned int rlim_fd_max_at_boot; +extern int atexit_flag; +extern unsigned char boot_seed[20]; // per-boot random seed (160 bits initially) +extern THREAD_LOCAL struct buffer trash; + +struct proxy; +struct server; +int main(int argc, char **argv); +void deinit(void); +__attribute__((noreturn)) void deinit_and_exit(int); +void run_poll_loop(void); 
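+
+/* A minimal illustration (not part of the API; report_startup() below is a
+ * hypothetical caller, assuming <stdio.h>) of how the globals exported above
+ * are typically consulted; all other identifiers come from this file and
+ * <haproxy/global-t.h>:
+ *
+ *	static void report_startup(void)
+ *	{
+ *		if (global.mode & MODE_VERBOSE)
+ *			printf("maxconn=%d nbthread=%d\n",
+ *			       global.maxconn, global.nbthread);
+ *		if (global.tune.options & GTUNE_BUSY_POLLING)
+ *			printf("busy polling is enabled\n");
+ *	}
+ */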
+int tell_old_pids(int sig); +int delete_oldpid(int pid); +void hap_register_build_opts(const char *str, int must_free); +void hap_register_feature(const char *name); +int split_version(const char *version, unsigned int *value); +int compare_current_version(const char *version); +void display_version(); + +void mworker_accept_wrapper(int fd); +void mworker_reload(int hardreload); + +/* to be used with warned and WARN_* */ +static inline int already_warned(unsigned int warning) +{ + if (warned & warning) + return 1; + warned |= warning; + return 0; +} + +extern unsigned int experimental_directives_allowed; + +struct cfg_keyword; +int check_kw_experimental(struct cfg_keyword *kw, const char *file, int linenum, + char **errmsg); +const char **hap_get_next_build_opt(const char **curr); + +/* simplified way to declare static build options in a file */ +#define REGISTER_BUILD_OPTS(str) \ + INITCALL2(STG_REGISTER, hap_register_build_opts, (str), 0) + +#endif /* _HAPROXY_GLOBAL_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/h1.h b/include/haproxy/h1.h new file mode 100644 index 0000000..7152c6e --- /dev/null +++ b/include/haproxy/h1.h @@ -0,0 +1,377 @@ +/* + * include/haproxy/h1.h + * This file contains HTTP/1 protocol definitions. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_H1_H +#define _HAPROXY_H1_H + +#include <import/ist.h> +#include <haproxy/api.h> +#include <haproxy/buf.h> +#include <haproxy/http.h> +#include <haproxy/http-hdr-t.h> +#include <haproxy/intops.h> + + +/* Possible states while parsing HTTP/1 messages (request|response) */ +enum h1m_state { + H1_MSG_RQBEFORE = 0, // request: leading LF, before start line + H1_MSG_RQBEFORE_CR = 1, // request: leading CRLF, before start line + /* these ones define a request start line */ + H1_MSG_RQMETH = 2, // parsing the Method + H1_MSG_RQMETH_SP = 3, // space(s) after the Method + H1_MSG_RQURI = 4, // parsing the Request URI + H1_MSG_RQURI_SP = 5, // space(s) after the Request URI + H1_MSG_RQVER = 6, // parsing the Request Version + H1_MSG_RQLINE_END = 7, // end of request line (CR or LF) + + H1_MSG_RPBEFORE = 8, // response: leading LF, before start line + H1_MSG_RPBEFORE_CR = 9, // response: leading CRLF, before start line + + /* these ones define a response start line */ + H1_MSG_RPVER = 10, // parsing the Response Version + H1_MSG_RPVER_SP = 11, // space(s) after the Response Version + H1_MSG_RPCODE = 12, // response code + H1_MSG_RPCODE_SP = 13, // space(s) after the response code + H1_MSG_RPREASON = 14, // response reason + H1_MSG_RPLINE_END = 15, // end of response line (CR or LF) + + /* common header processing */ + H1_MSG_HDR_FIRST = 16, // waiting for first header or last CRLF (no LWS possible) + H1_MSG_HDR_NAME = 17, // parsing header name + H1_MSG_HDR_COL = 18, // parsing header colon + H1_MSG_HDR_L1_SP = 19, // parsing header LWS (SP|HT) before value + H1_MSG_HDR_L1_LF = 20, // parsing header LWS (LF) before value + H1_MSG_HDR_L1_LWS = 21, // checking whether it's a new header or an LWS + H1_MSG_HDR_VAL = 22, // parsing header value + H1_MSG_HDR_L2_LF = 23, // parsing header LWS (LF) inside/after value + H1_MSG_HDR_L2_LWS = 24, // checking whether it's a new header or an LWS + + H1_MSG_LAST_LF = 25, // parsing last LF, last state for headers + + /* Body processing. 
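+	 * As an illustration, a chunked message body walks CHUNK_SIZE -> DATA
+	 * -> CHUNK_CRLF and back to CHUNK_SIZE for each chunk, until a
+	 * zero-sized chunk leads to TRAILERS and finally DONE, while a
+	 * content-length body goes straight from DATA to DONE.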
*/ + + H1_MSG_CHUNK_SIZE = 26, // parsing the chunk size (RFC7230 #4.1) + H1_MSG_DATA = 27, // skipping data chunk / content-length data + H1_MSG_CHUNK_CRLF = 28, // skipping CRLF after data chunk + H1_MSG_TRAILERS = 29, // trailers (post-data entity headers) + /* we enter this state when we've received the end of the current message */ + H1_MSG_DONE = 30, // message end received, waiting for resync or close + H1_MSG_TUNNEL = 31, // tunneled data after DONE +} __attribute__((packed)); + + +/* HTTP/1 message flags (32 bit), for use in h1m->flags only */ +#define H1_MF_NONE 0x00000000 +#define H1_MF_CLEN 0x00000001 // content-length present +#define H1_MF_CHNK 0x00000002 // chunk present (as last encoding), exclusive with c-l +#define H1_MF_RESP 0x00000004 // this message is the response message +#define H1_MF_TOLOWER 0x00000008 // turn the header names to lower case +#define H1_MF_VER_11 0x00000010 // message indicates version 1.1 or above +#define H1_MF_CONN_CLO 0x00000020 // message contains "connection: close" +#define H1_MF_CONN_KAL 0x00000040 // message contains "connection: keep-alive" +#define H1_MF_CONN_UPG 0x00000080 // message contains "connection: upgrade" +#define H1_MF_XFER_LEN 0x00000100 // message xfer size can be determined +#define H1_MF_XFER_ENC 0x00000200 // transfer-encoding is present +#define H1_MF_NO_PHDR 0x00000400 // don't add pseudo-headers in the header list +#define H1_MF_HDRS_ONLY 0x00000800 // parse headers only +#define H1_MF_CLEAN_CONN_HDR 0x00001000 // skip close/keep-alive values of connection headers during parsing +#define H1_MF_METH_CONNECT 0x00002000 // Set for a response to a CONNECT request +#define H1_MF_METH_HEAD 0x00004000 // Set for a response to a HEAD request +#define H1_MF_UPG_WEBSOCKET 0x00008000 // Set for a Websocket upgrade handshake +#define H1_MF_TE_CHUNKED 0x00010000 // T-E "chunked" +#define H1_MF_TE_OTHER 0x00020000 // T-E other than supported ones found (only "chunked" is supported for now) + +/* Mask to use to reset H1M flags when we restart headers parsing. + * + * WARNING: Don't forget to update it if a new flag must be preserved when + * headers parsing is restarted. + */ +#define H1_MF_RESTART_MASK (H1_MF_RESP|H1_MF_TOLOWER|H1_MF_NO_PHDR|H1_MF_HDRS_ONLY| \ + H1_MF_CLEAN_CONN_HDR|H1_MF_METH_CONNECT|H1_MF_METH_HEAD) + +/* Note: for a connection to be persistent, we need this for the request : + * - one of CLEN or CHNK + * - version 1.0 and KAL and not CLO + * - or version 1.1 and not CLO + * For the response it's the same except that UPG must not appear either. + * So in short, for a request it's (CLEN|CHNK) > 0 && !CLO && (VER_11 || KAL) + * and for a response it's (CLEN|CHNK) > 0 && !(CLO|UPG) && (VER_11 || KAL) + */ + + +/* basic HTTP/1 message state for use in parsers. The err_pos field is special, + * it is pre-set to a negative value (-1 or -2), and once non-negative it contains + * the relative position in the message of the first parse error. -2 is used to tell + * the parser that we want to block the invalid message. -1 is used to only perform + * a silent capture. 
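+ * For example, h1m_init_req() below presets err_pos to -2 so that invalid
+ * messages are blocked by default, while a caller only interested in
+ * capturing the error location would preset it to -1 and read err_pos back
+ * once it turns non-negative.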
+ */
+struct h1m {
+	enum h1m_state state;       // H1 message state (H1_MSG_*)
+	/* 24 bits available here */
+	uint32_t flags;             // H1 message flags (H1_MF_*)
+	uint64_t curr_len;          // content-length or last chunk length
+	uint64_t body_len;          // total known length of the body
+	uint32_t next;              // next byte to parse, relative to buffer's head
+	int err_pos;                // position in the byte stream of the first error (H1 or H2)
+	int err_state;              // state where the first error was met (H1 or H2)
+};
+
+/* basic H1 start line, describes either the request or the response */
+union h1_sl {                          /* useful start line pointers, relative to ->sol */
+	struct {
+		struct ist m;          /* METHOD */
+		struct ist u;          /* URI */
+		struct ist v;          /* VERSION */
+		enum http_meth_t meth; /* method */
+	} rq;                          /* request line : field, length */
+	struct {
+		struct ist v;          /* VERSION */
+		struct ist c;          /* CODE */
+		struct ist r;          /* REASON */
+		uint16_t status;       /* status code */
+	} st;                          /* status line : field, length */
+};
+
+int h1_headers_to_hdr_list(char *start, const char *stop,
+                           struct http_hdr *hdr, unsigned int hdr_num,
+                           struct h1m *h1m, union h1_sl *slp);
+int h1_measure_trailers(const struct buffer *buf, unsigned int ofs, unsigned int max);
+
+int h1_parse_cont_len_header(struct h1m *h1m, struct ist *value);
+int h1_parse_xfer_enc_header(struct h1m *h1m, struct ist value);
+void h1_parse_connection_header(struct h1m *h1m, struct ist *value);
+void h1_parse_upgrade_header(struct h1m *h1m, struct ist value);
+
+void h1_generate_random_ws_input_key(char key_out[25]);
+void h1_calculate_ws_output_key(const char *key, char *result);
+
+/* for debugging, reports the HTTP/1 message state name */
+static inline const char *h1m_state_str(enum h1m_state msg_state)
+{
+	switch (msg_state) {
+	case H1_MSG_RQBEFORE:    return "MSG_RQBEFORE";
+	case H1_MSG_RQBEFORE_CR: return "MSG_RQBEFORE_CR";
+	case H1_MSG_RQMETH:      return "MSG_RQMETH";
+	case H1_MSG_RQMETH_SP:   return "MSG_RQMETH_SP";
+	case H1_MSG_RQURI:       return "MSG_RQURI";
+	case H1_MSG_RQURI_SP:    return "MSG_RQURI_SP";
+	case H1_MSG_RQVER:       return "MSG_RQVER";
+	case H1_MSG_RQLINE_END:  return "MSG_RQLINE_END";
+	case H1_MSG_RPBEFORE:    return "MSG_RPBEFORE";
+	case H1_MSG_RPBEFORE_CR: return "MSG_RPBEFORE_CR";
+	case H1_MSG_RPVER:       return "MSG_RPVER";
+	case H1_MSG_RPVER_SP:    return "MSG_RPVER_SP";
+	case H1_MSG_RPCODE:      return "MSG_RPCODE";
+	case H1_MSG_RPCODE_SP:   return "MSG_RPCODE_SP";
+	case H1_MSG_RPREASON:    return "MSG_RPREASON";
+	case H1_MSG_RPLINE_END:  return "MSG_RPLINE_END";
+	case H1_MSG_HDR_FIRST:   return "MSG_HDR_FIRST";
+	case H1_MSG_HDR_NAME:    return "MSG_HDR_NAME";
+	case H1_MSG_HDR_COL:     return "MSG_HDR_COL";
+	case H1_MSG_HDR_L1_SP:   return "MSG_HDR_L1_SP";
+	case H1_MSG_HDR_L1_LF:   return "MSG_HDR_L1_LF";
+	case H1_MSG_HDR_L1_LWS:  return "MSG_HDR_L1_LWS";
+	case H1_MSG_HDR_VAL:     return "MSG_HDR_VAL";
+	case H1_MSG_HDR_L2_LF:   return "MSG_HDR_L2_LF";
+	case H1_MSG_HDR_L2_LWS:  return "MSG_HDR_L2_LWS";
+	case H1_MSG_LAST_LF:     return "MSG_LAST_LF";
+	case H1_MSG_CHUNK_SIZE:  return "MSG_CHUNK_SIZE";
+	case H1_MSG_DATA:        return "MSG_DATA";
+	case H1_MSG_CHUNK_CRLF:  return "MSG_CHUNK_CRLF";
+	case H1_MSG_TRAILERS:    return "MSG_TRAILERS";
+	case H1_MSG_DONE:        return "MSG_DONE";
+	case H1_MSG_TUNNEL:      return "MSG_TUNNEL";
+	default:                 return "MSG_??????";
+	}
+}
+
+/* This function may be called only in H1_MSG_CHUNK_CRLF. It reads the CRLF
+ * at the end of a chunk. The caller should adjust msg->next
+ * in order to include this part into the next forwarding phase.
Note that the
+ * caller must ensure that head+start points to the first byte to parse. It
+ * returns the number of bytes parsed on success, so the caller can set msg_state
+ * to H1_MSG_CHUNK_SIZE. If not enough data are available, the function does not
+ * change anything and returns zero. Otherwise it returns a negative value
+ * indicating the error position relative to <stop>. Note: this function is
+ * designed to parse wrapped CRLF at the end of the buffer.
+ */
+static inline int h1_skip_chunk_crlf(const struct buffer *buf, int start, int stop)
+{
+	const char *ptr = b_peek(buf, start);
+	int bytes = 1;
+
+	if (stop <= start)
+		return 0;
+
+	if (unlikely(*ptr != '\r')) // negative position to stop
+		return ptr - __b_peek(buf, stop);
+
+	/* NB: we'll check data availability at the end. It's not a
+	 * problem because whatever we match first will be checked
+	 * against the correct length.
+	 */
+	bytes++;
+	ptr++;
+	if (ptr >= b_wrap(buf))
+		ptr = b_orig(buf);
+
+	if (bytes > stop - start)
+		return 0;
+
+	if (*ptr != '\n') // negative position to stop
+		return ptr - __b_peek(buf, stop);
+
+	return bytes;
+}
+
+/* Parses the chunk size starting at buf + start and stopping before buf + stop.
+ * The positions are relative to the buffer's head.
+ * It returns the chunk size in <res> and the number of bytes read this way :
+ *   < 0 : error at this position relative to <stop>
+ *   = 0 : not enough bytes to read a complete chunk size
+ *   > 0 : number of bytes successfully read that the caller can skip
+ * On success, the caller should adjust its msg->next to point to the first
+ * byte of data after the chunk size, so that we know we can forward exactly
+ * msg->next bytes, and msg->sol to contain the exact number of bytes forming
+ * the chunk size. That way it is always possible to differentiate between the
+ * start of the body and the start of the data. Note: this function is designed
+ * to parse wrapped CRLF at the end of the buffer.
+ */
+static inline int h1_parse_chunk_size(const struct buffer *buf, int start, int stop, uint64_t *res)
+{
+	const char *ptr = b_peek(buf, start);
+	const char *ptr_old = ptr;
+	const char *end = b_wrap(buf);
+	uint64_t chunk = 0;
+
+	stop -= start; // bytes left
+	start = stop;  // bytes to transfer
+
+	/* The chunk size is in the following form, though we are only
+	 * interested in the size and CRLF :
+	 *    1*HEXDIGIT *WSP *[ ';' extensions ] CRLF
+	 */
+	while (1) {
+		int c;
+		if (!stop)
+			return 0;
+		c = hex2i(*ptr);
+		if (c < 0) /* not a hex digit anymore */
+			break;
+		if (unlikely(++ptr >= end))
+			ptr = b_orig(buf);
+		chunk = (chunk << 4) + c;
+		if (unlikely(chunk & 0xF0000000000000ULL)) {
+			/* Don't accept more than 13 hex digits (2^52 - 1), to
+			 * never feed possibly bogus values to languages that
+			 * use floats for their integers
+			 */
+			goto error;
+		}
+		stop--;
+	}
+
+	/* empty size not allowed */
+	if (unlikely(ptr == ptr_old))
+		goto error;
+
+	while (HTTP_IS_SPHT(*ptr)) {
+		if (++ptr >= end)
+			ptr = b_orig(buf);
+		if (--stop == 0)
+			return 0;
+	}
+
+	/* Up to there, we know that at least one byte is present at *ptr. Check
+	 * for the end of chunk size.
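+	 * As a worked example, parsing "1a3\r\n" accumulates chunk = 0x1,
+	 * then 0x1a, then 0x1a3 in the hex loop above; the loop below then
+	 * consumes the CRLF, so the function sets *res = 0x1a3 and returns 5,
+	 * the number of bytes the caller may skip.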
+	 */
+	while (1) {
+		if (likely(*ptr == '\r')) {
+			/* we now have a CR, it must be followed by a LF */
+			if (++ptr >= end)
+				ptr = b_orig(buf);
+			if (--stop == 0)
+				return 0;
+
+			if (*ptr != '\n')
+				goto error;
+			if (++ptr >= end)
+				ptr = b_orig(buf);
+			--stop;
+			/* done */
+			break;
+		}
+		else if (likely(*ptr == ';')) {
+			/* chunk extension, ends at next CRLF */
+			if (++ptr >= end)
+				ptr = b_orig(buf);
+			if (--stop == 0)
+				return 0;
+
+			while (!HTTP_IS_CRLF(*ptr)) {
+				if (++ptr >= end)
+					ptr = b_orig(buf);
+				if (--stop == 0)
+					return 0;
+			}
+			/* we have a CRLF now, loop above */
+			continue;
+		}
+		else
+			goto error;
+	}
+
+	/* OK we found our CRLF and now <ptr> points to the next byte, which may
+	 * or may not be present. Let's return the number of bytes parsed.
+	 */
+	*res = chunk;
+	return start - stop;
+ error:
+	*res = 0; // just to stop gcc's -Wuninitialized warning :-(
+	return -stop;
+}
+
+/* initializes an H1 message for a request */
+static inline struct h1m *h1m_init_req(struct h1m *h1m)
+{
+	h1m->state = H1_MSG_RQBEFORE;
+	h1m->next = 0;
+	h1m->flags = H1_MF_NONE;
+	h1m->curr_len = 0;
+	h1m->body_len = 0;
+	h1m->err_pos = -2;
+	h1m->err_state = 0;
+	return h1m;
+}
+
+/* initializes an H1 message for a response */
+static inline struct h1m *h1m_init_res(struct h1m *h1m)
+{
+	h1m->state = H1_MSG_RPBEFORE;
+	h1m->next = 0;
+	h1m->flags = H1_MF_RESP;
+	h1m->curr_len = 0;
+	h1m->body_len = 0;
+	h1m->err_pos = -2;
+	h1m->err_state = 0;
+	return h1m;
+}
+
+#endif /* _HAPROXY_H1_H */
diff --git a/include/haproxy/h1_htx.h b/include/haproxy/h1_htx.h
new file mode 100644
index 0000000..61b96e0
--- /dev/null
+++ b/include/haproxy/h1_htx.h
@@ -0,0 +1,76 @@
+/*
+ * include/haproxy/h1_htx.h
+ * This file defines function prototypes for H1 manipulation using the
+ * internal representation.
+ *
+ * Copyright (C) 2019 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_H1_HTX_H
+#define _HAPROXY_H1_HTX_H
+
+#include <import/ist.h>
+#include <haproxy/api-t.h>
+#include <haproxy/buf-t.h>
+#include <haproxy/h1.h>
+#include <haproxy/htx.h>
+
+int h1_parse_msg_hdrs(struct h1m *h1m, union h1_sl *h1sl, struct htx *dsthtx,
+                      struct buffer *srcbuf, size_t ofs, size_t max);
+size_t h1_parse_msg_data(struct h1m *h1m, struct htx **dsthtx,
+                         struct buffer *srcbuf, size_t ofs, size_t max,
+                         struct buffer *htxbuf);
+int h1_parse_msg_tlrs(struct h1m *h1m, struct htx *dsthtx,
+                      struct buffer *srcbuf, size_t ofs, size_t max);
+
+/* Returns the URI of an HTX message in the most common format for an H1 peer.
+ * It is the path part of an absolute URI when the URI was normalized,
+ * otherwise it is the whole URI, as received. Concretely, it is only a special
+ * case for URIs received from H2 clients, to be able to send a relative path
+ * to the H1 servers.
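+ * For example (illustrative values), an H2 request normalized to the
+ * absolute form "https://example.com/blog?id=1" yields "/blog?id=1" here,
+ * an OPTIONS request whose normalized path is empty yields "*", and any
+ * other empty path yields "/".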
+ */ +static inline struct ist h1_get_uri(const struct htx_sl *sl) +{ + struct ist uri; + + uri = htx_sl_req_uri(sl); + if (sl->flags & HTX_SL_F_NORMALIZED_URI) { + struct http_uri_parser parser = http_uri_parser_init(uri); + uri = http_parse_path(&parser); + if (unlikely(!uri.len)) { + if (sl->info.req.meth == HTTP_METH_OPTIONS) + uri = ist("*"); + else + uri = ist("/"); + } + } + return uri; +} + +int h1_format_htx_reqline(const struct htx_sl *sl, struct buffer *chk); +int h1_format_htx_stline(const struct htx_sl *sl, struct buffer *chk); +int h1_format_htx_hdr(const struct ist n, const struct ist v, struct buffer *chk); +int h1_format_htx_data(const struct ist data, struct buffer *chk, int chunked); + +#endif /* _HAPROXY_H1_HTX_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/h2.h b/include/haproxy/h2.h new file mode 100644 index 0000000..4082b38 --- /dev/null +++ b/include/haproxy/h2.h @@ -0,0 +1,351 @@ +/* + * include/haproxy/h2.h + * This file contains types and macros used for the HTTP/2 protocol + * + * Copyright (C) 2000-2017 Willy Tarreau - w@1wt.eu + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _HAPROXY_H2_H +#define _HAPROXY_H2_H + +#include <import/ist.h> +#include <haproxy/api.h> +#include <haproxy/http-hdr-t.h> +#include <haproxy/htx-t.h> + +/* indexes of most important pseudo headers can be simplified to an almost + * linear array by dividing the index by 2 for all values from 1 to 9, and + * caping to 4 for values up to 14 ; thus it fits in a single 24-bit array + * shifted by 3 times the index value/2, or a 32-bit array shifted by 4x. + * Don't change these values, they are assumed by hpack_idx_to_phdr(). There + * is an entry for the Host header field which is not a pseudo-header but + * needs to be tracked as we should only use :authority if it's absent. + */ +enum { + H2_PHDR_IDX_NONE = 0, + H2_PHDR_IDX_AUTH = 1, /* :authority = 1 */ + H2_PHDR_IDX_METH = 2, /* :method = 2..3 */ + H2_PHDR_IDX_PATH = 3, /* :path = 4..5 */ + H2_PHDR_IDX_SCHM = 4, /* :scheme = 6..7 */ + H2_PHDR_IDX_STAT = 5, /* :status = 8..14 */ + H2_PHDR_IDX_HOST = 6, /* Host, never returned, just a place-holder */ + H2_PHDR_IDX_PROT = 7, /* :protocol from rfc 8441 Extended Connect */ + H2_PHDR_NUM_ENTRIES /* must be last */ +}; + +/* bit fields indicating the pseudo-headers found. 
It also covers the HOST + * header field as well as any non-pseudo-header field (NONE). + */ +enum { + H2_PHDR_FND_NONE = 1 << H2_PHDR_IDX_NONE, /* found a regular header */ + H2_PHDR_FND_AUTH = 1 << H2_PHDR_IDX_AUTH, + H2_PHDR_FND_METH = 1 << H2_PHDR_IDX_METH, + H2_PHDR_FND_PATH = 1 << H2_PHDR_IDX_PATH, + H2_PHDR_FND_SCHM = 1 << H2_PHDR_IDX_SCHM, + H2_PHDR_FND_STAT = 1 << H2_PHDR_IDX_STAT, + H2_PHDR_FND_HOST = 1 << H2_PHDR_IDX_HOST, + H2_PHDR_FND_PROT = 1 << H2_PHDR_IDX_PROT, +}; + +/* frame types, from the standard */ +enum h2_ft { + H2_FT_DATA = 0x00, // RFC7540 #6.1 + H2_FT_HEADERS = 0x01, // RFC7540 #6.2 + H2_FT_PRIORITY = 0x02, // RFC7540 #6.3 + H2_FT_RST_STREAM = 0x03, // RFC7540 #6.4 + H2_FT_SETTINGS = 0x04, // RFC7540 #6.5 + H2_FT_PUSH_PROMISE = 0x05, // RFC7540 #6.6 + H2_FT_PING = 0x06, // RFC7540 #6.7 + H2_FT_GOAWAY = 0x07, // RFC7540 #6.8 + H2_FT_WINDOW_UPDATE = 0x08, // RFC7540 #6.9 + H2_FT_CONTINUATION = 0x09, // RFC7540 #6.10 + H2_FT_ENTRIES /* must be last */ +} __attribute__((packed)); + +/* frame types, turned to bits or bit fields */ +enum { + /* one bit per frame type */ + H2_FT_DATA_BIT = 1U << H2_FT_DATA, + H2_FT_HEADERS_BIT = 1U << H2_FT_HEADERS, + H2_FT_PRIORITY_BIT = 1U << H2_FT_PRIORITY, + H2_FT_RST_STREAM_BIT = 1U << H2_FT_RST_STREAM, + H2_FT_SETTINGS_BIT = 1U << H2_FT_SETTINGS, + H2_FT_PUSH_PROMISE_BIT = 1U << H2_FT_PUSH_PROMISE, + H2_FT_PING_BIT = 1U << H2_FT_PING, + H2_FT_GOAWAY_BIT = 1U << H2_FT_GOAWAY, + H2_FT_WINDOW_UPDATE_BIT = 1U << H2_FT_WINDOW_UPDATE, + H2_FT_CONTINUATION_BIT = 1U << H2_FT_CONTINUATION, + /* padded frames */ + H2_FT_PADDED_MASK = H2_FT_DATA_BIT | H2_FT_HEADERS_BIT | H2_FT_PUSH_PROMISE_BIT, + /* flow controlled frames */ + H2_FT_FC_MASK = H2_FT_DATA_BIT, + /* header frames */ + H2_FT_HDR_MASK = H2_FT_HEADERS_BIT | H2_FT_PUSH_PROMISE_BIT | H2_FT_CONTINUATION_BIT, + /* frames allowed to arrive late on a stream */ + H2_FT_LATE_MASK = H2_FT_WINDOW_UPDATE_BIT | H2_FT_RST_STREAM_BIT | H2_FT_PRIORITY_BIT, +}; + + +/* flags defined for each frame type */ + +// RFC7540 #6.1 +#define H2_F_DATA_END_STREAM 0x01 +#define H2_F_DATA_PADDED 0x08 + +// RFC7540 #6.2 +#define H2_F_HEADERS_END_STREAM 0x01 +#define H2_F_HEADERS_END_HEADERS 0x04 +#define H2_F_HEADERS_PADDED 0x08 +#define H2_F_HEADERS_PRIORITY 0x20 + +// RFC7540 #6.3 : PRIORITY defines no flags +// RFC7540 #6.4 : RST_STREAM defines no flags + +// RFC7540 #6.5 +#define H2_F_SETTINGS_ACK 0x01 + +// RFC7540 #6.6 +#define H2_F_PUSH_PROMISE_END_HEADERS 0x04 +#define H2_F_PUSH_PROMISE_PADDED 0x08 + +// RFC7540 #6.7 +#define H2_F_PING_ACK 0x01 + +// RFC7540 #6.8 : GOAWAY defines no flags +// RFC7540 #6.9 : WINDOW_UPDATE defines no flags + +// PADDED is the exact same among DATA, HEADERS and PUSH_PROMISE (8) +#define H2_F_PADDED 0x08 + +/* HTTP/2 error codes - RFC7540 #7 */ +enum h2_err { + H2_ERR_NO_ERROR = 0x0, + H2_ERR_PROTOCOL_ERROR = 0x1, + H2_ERR_INTERNAL_ERROR = 0x2, + H2_ERR_FLOW_CONTROL_ERROR = 0x3, + H2_ERR_SETTINGS_TIMEOUT = 0x4, + H2_ERR_STREAM_CLOSED = 0x5, + H2_ERR_FRAME_SIZE_ERROR = 0x6, + H2_ERR_REFUSED_STREAM = 0x7, + H2_ERR_CANCEL = 0x8, + H2_ERR_COMPRESSION_ERROR = 0x9, + H2_ERR_CONNECT_ERROR = 0xa, + H2_ERR_ENHANCE_YOUR_CALM = 0xb, + H2_ERR_INADEQUATE_SECURITY = 0xc, + H2_ERR_HTTP_1_1_REQUIRED = 0xd, +} __attribute__((packed)); + +// RFC7540 #11.3 : Settings Registry +#define H2_SETTINGS_HEADER_TABLE_SIZE 0x0001 +#define H2_SETTINGS_ENABLE_PUSH 0x0002 +#define H2_SETTINGS_MAX_CONCURRENT_STREAMS 0x0003 +#define H2_SETTINGS_INITIAL_WINDOW_SIZE 0x0004 +#define 
H2_SETTINGS_MAX_FRAME_SIZE 0x0005 +#define H2_SETTINGS_MAX_HEADER_LIST_SIZE 0x0006 +#define H2_SETTINGS_ENABLE_CONNECT_PROTOCOL 0x0008 + + +/* some protocol constants */ + +// PRI * HTTP/2.0\r\n\r\nSM\r\n\r\n +#define H2_CONN_PREFACE \ + "\x50\x52\x49\x20\x2a\x20\x48\x54" \ + "\x54\x50\x2f\x32\x2e\x30\x0d\x0a" \ + "\x0d\x0a\x53\x4d\x0d\x0a\x0d\x0a" + + +/* some flags related to protocol parsing */ +#define H2_MSGF_BODY 0x0001 // a body is present +#define H2_MSGF_BODY_CL 0x0002 // content-length is present +#define H2_MSGF_BODY_TUNNEL 0x0004 // a tunnel is in use (CONNECT) +#define H2_MSGF_RSP_1XX 0x0010 // a 1xx ( != 101) HEADERS frame was received +#define H2_MSGF_BODYLESS_RSP 0x0020 // response message is known to have no body + // (response to HEAD request or 204/304 response) +#define H2_MSGF_EXT_CONNECT 0x0040 // Extended CONNECT method from rfc 8441 + +#define H2_MAX_STREAM_ID ((1U << 31) - 1) +#define H2_MAX_FRAME_LEN ((1U << 24) - 1) +#define H2_DIR_REQ 1 +#define H2_DIR_RES 2 +#define H2_DIR_BOTH 3 + +/* constraints imposed by the protocol on each frame type, in terms of stream + * ID values, frame sizes, and direction so that most connection-level checks + * can be centralized regardless of the frame's acceptance. + */ +struct h2_frame_definition { + int32_t dir; /* 0=none, 1=request, 2=response, 3=both */ + int32_t min_id; /* minimum allowed stream ID */ + int32_t max_id; /* maximum allowed stream ID */ + int32_t min_len; /* minimum frame length */ + int32_t max_len; /* maximum frame length */ +}; + +extern struct h2_frame_definition h2_frame_definition[H2_FT_ENTRIES]; + +/* various protocol processing functions */ + +int h2_parse_cont_len_header(unsigned int *msgf, struct ist *value, unsigned long long *body_len); +int h2_make_htx_request(struct http_hdr *list, struct htx *htx, unsigned int *msgf, unsigned long long *body_len, int relaxed); +int h2_make_htx_response(struct http_hdr *list, struct htx *htx, unsigned int *msgf, unsigned long long *body_len, char *upgrade_protocol); +int h2_make_htx_trailers(struct http_hdr *list, struct htx *htx); + +/* + * Some helpful debugging functions. 
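+ * As an illustrative sketch (the caller is hypothetical, assuming <stdio.h>),
+ * a demuxer can combine the frame definition table above with the helpers
+ * below to validate and trace an incoming frame header:
+ *
+ *	int err = h2_frame_check(ft, H2_DIR_REQ, id, len, mfs);
+ *	if (err != H2_ERR_NO_ERROR)
+ *		printf("%s frame rejected: %s\n", h2_ft_str(ft), h2_err_str(err));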
+ */ + +/* returns a bit corresponding to the frame type */ +static inline unsigned int h2_ft_bit(enum h2_ft ft) +{ + if (ft >= H2_FT_ENTRIES) + return 0; + return 1U << ft; +} + +/* returns the frame type as a string */ +static inline const char *h2_ft_str(int type) +{ + switch (type) { + case H2_FT_DATA : return "DATA"; + case H2_FT_HEADERS : return "HEADERS"; + case H2_FT_PRIORITY : return "PRIORITY"; + case H2_FT_RST_STREAM : return "RST_STREAM"; + case H2_FT_SETTINGS : return "SETTINGS"; + case H2_FT_PUSH_PROMISE : return "PUSH_PROMISE"; + case H2_FT_PING : return "PING"; + case H2_FT_GOAWAY : return "GOAWAY"; + case H2_FT_WINDOW_UPDATE : return "WINDOW_UPDATE"; + default : return "_UNKNOWN_"; + } +} + +/* returns the error code as a string */ +static inline const char *h2_err_str(enum h2_err err) +{ + switch (err) { + case H2_ERR_NO_ERROR : return "NO_ERROR"; + case H2_ERR_PROTOCOL_ERROR : return "PROTOCOL_ERROR"; + case H2_ERR_INTERNAL_ERROR : return "INTERNAL_ERROR"; + case H2_ERR_FLOW_CONTROL_ERROR : return "FLOW_CONTROL_ERROR"; + case H2_ERR_SETTINGS_TIMEOUT : return "SETTINGS_TIMEOUT"; + case H2_ERR_STREAM_CLOSED : return "STREAM_CLOSED"; + case H2_ERR_FRAME_SIZE_ERROR : return "FRAME_SIZE_ERROR"; + case H2_ERR_REFUSED_STREAM : return "REFUSED_STREAM"; + case H2_ERR_CANCEL : return "CANCEL"; + case H2_ERR_COMPRESSION_ERROR : return "COMPRESSION_ERROR"; + case H2_ERR_CONNECT_ERROR : return "CONNECT_ERROR"; + case H2_ERR_ENHANCE_YOUR_CALM : return "ENHANCE_YOUR_CALM"; + case H2_ERR_INADEQUATE_SECURITY : return "INADEQUATE_SECURITY"; + case H2_ERR_HTTP_1_1_REQUIRED : return "HTTP_1_1_REQUIRED"; + default : return "_UNKNOWN_"; + } +} + +/* Returns an error code if the frame is valid protocol-wise, otherwise 0. <ft> + * is the frame type (H2_FT_*), <dir> is the direction (1=req, 2=res), <id> is + * the stream ID from the frame header, <len> is the frame length from the + * header. The purpose is to be able to quickly return a PROTOCOL_ERROR or + * FRAME_SIZE_ERROR connection error even for situations where the frame will + * be ignored. <mfs> must be the max frame size currently in place for the + * protocol. + */ +static inline int h2_frame_check(enum h2_ft ft, int dir, int32_t id, int32_t len, int32_t mfs) +{ + struct h2_frame_definition *fd; + + if (ft >= H2_FT_ENTRIES) + return H2_ERR_NO_ERROR; // ignore unhandled frame types + + fd = &h2_frame_definition[ft]; + + if (!(dir & fd->dir)) + return H2_ERR_PROTOCOL_ERROR; + + if (id < fd->min_id || id > fd->max_id) + return H2_ERR_PROTOCOL_ERROR; + + if (len < fd->min_len || len > fd->max_len) + return H2_ERR_FRAME_SIZE_ERROR; + + if (len > mfs) + return H2_ERR_FRAME_SIZE_ERROR; + + if (ft == H2_FT_SETTINGS && (len % 6) != 0) + return H2_ERR_FRAME_SIZE_ERROR; // RFC7540#6.5 + + return H2_ERR_NO_ERROR; +} + +/* returns the pseudo-header <str> corresponds to among H2_PHDR_IDX_*, 0 if not a + * pseudo-header, or -1 if not a valid pseudo-header. 
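+ * For example, h2_str_to_phdr(ist(":path")) returns H2_PHDR_IDX_PATH,
+ * h2_str_to_phdr(ist(":foo")) returns -1 (unknown pseudo-header), and
+ * h2_str_to_phdr(ist("host")) returns 0 (regular header field).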
+ */ +static inline int h2_str_to_phdr(const struct ist str) +{ + if (*str.ptr == ':') { + if (isteq(str, ist(":path"))) return H2_PHDR_IDX_PATH; + else if (isteq(str, ist(":method"))) return H2_PHDR_IDX_METH; + else if (isteq(str, ist(":scheme"))) return H2_PHDR_IDX_SCHM; + else if (isteq(str, ist(":status"))) return H2_PHDR_IDX_STAT; + else if (isteq(str, ist(":authority"))) return H2_PHDR_IDX_AUTH; + else if (isteq(str, ist(":protocol"))) return H2_PHDR_IDX_PROT; + + /* all other names starting with ':' */ + return -1; + } + + /* not a pseudo header */ + return 0; +} + +/* returns the pseudo-header name <num> as an ist, or ":UNKNOWN" if unknown. + * Note that all strings are zero-terminated constants. + */ +static inline struct ist h2_phdr_to_ist(int phdr) +{ + switch (phdr) { + case H2_PHDR_IDX_NONE: return ist(":NONE"); + case H2_PHDR_IDX_AUTH: return ist(":authority"); + case H2_PHDR_IDX_METH: return ist(":method"); + case H2_PHDR_IDX_PATH: return ist(":path"); + case H2_PHDR_IDX_SCHM: return ist(":scheme"); + case H2_PHDR_IDX_STAT: return ist(":status"); + case H2_PHDR_IDX_HOST: return ist("Host"); + default: return ist(":UNKNOWN"); + } +} + +/* returns the pseudo-header name <num> as a string, or ":UNKNOWN" if unknown */ +static inline const char *h2_phdr_to_str(int phdr) +{ + return h2_phdr_to_ist(phdr).ptr; +} + +#endif /* _HAPROXY_H2_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/h3.h b/include/haproxy/h3.h new file mode 100644 index 0000000..1bedf43 --- /dev/null +++ b/include/haproxy/h3.h @@ -0,0 +1,118 @@ +/* + * include/haproxy/h3.h + * This file contains types for H3 + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_H3_T_H +#define _HAPROXY_H3_T_H +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <haproxy/buf-t.h> +#include <haproxy/mux_quic-t.h> + +/* H3 unidirecational stream types + * Emitted as the first byte on the stream to differentiate it. + */ +#define H3_UNI_S_T_CTRL 0x00 +#define H3_UNI_S_T_PUSH 0x01 +#define H3_UNI_S_T_QPACK_ENC 0x02 +#define H3_UNI_S_T_QPACK_DEC 0x03 +/* Must be the last one */ +#define H3_UNI_S_T_MAX H3_UNI_S_T_QPACK_DEC + +/* Settings */ +#define H3_SETTINGS_RESERVED_0 0x00 +#define H3_SETTINGS_QPACK_MAX_TABLE_CAPACITY 0x01 +/* there is a hole here of reserved settings, matching the h2 settings */ +#define H3_SETTINGS_RESERVED_2 0x02 +#define H3_SETTINGS_RESERVED_3 0x03 +#define H3_SETTINGS_RESERVED_4 0x04 +#define H3_SETTINGS_RESERVED_5 0x05 +#define H3_SETTINGS_MAX_FIELD_SECTION_SIZE 0x06 +#define H3_SETTINGS_QPACK_BLOCKED_STREAMS 0x07 + +/* Errors. 
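+ * These are the HTTP/3 application error codes (RFC 9114), sent as the error
+ * code of QUIC CONNECTION_CLOSE, RESET_STREAM or STOP_SENDING frames; the
+ * 0x2xx values are the QPACK error codes from RFC 9204.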
*/ +enum h3_err { + H3_NO_ERROR = 0x100, + H3_GENERAL_PROTOCOL_ERROR = 0x101, + H3_INTERNAL_ERROR = 0x102, + H3_STREAM_CREATION_ERROR = 0x103, + H3_CLOSED_CRITICAL_STREAM = 0x104, + H3_FRAME_UNEXPECTED = 0x105, + H3_FRAME_ERROR = 0x106, + H3_EXCESSIVE_LOAD = 0x107, + H3_ID_ERROR = 0x108, + H3_SETTINGS_ERROR = 0x109, + H3_MISSING_SETTINGS = 0x10a, + H3_REQUEST_REJECTED = 0x10b, + H3_REQUEST_CANCELLED = 0x10c, + H3_REQUEST_INCOMPLETE = 0x10d, + H3_MESSAGE_ERROR = 0x10e, + H3_CONNECT_ERROR = 0x10f, + H3_VERSION_FALLBACK = 0x110, + + QPACK_DECOMPRESSION_FAILED = 0x200, + QPACK_ENCODER_STREAM_ERROR = 0x201, + QPACK_DECODER_STREAM_ERROR = 0x202, +}; + +/* Frame types. */ +enum h3_ft { + /* internal value used to mark demuxing as inactive */ + H3_FT_UNINIT = -1, + + H3_FT_DATA = 0x00, + H3_FT_HEADERS = 0x01, + /* hole */ + H3_FT_CANCEL_PUSH = 0x03, + H3_FT_SETTINGS = 0x04, + H3_FT_PUSH_PROMISE = 0x05, + /* hole */ + H3_FT_GOAWAY = 0x07, + /* hole */ + H3_FT_MAX_PUSH_ID = 0x0d, +}; + +/* Stream types */ +enum h3s_t { + /* unidirectional streams */ + H3S_T_CTRL, + H3S_T_PUSH, + H3S_T_QPACK_DEC, + H3S_T_QPACK_ENC, + + /* bidirectional streams */ + H3S_T_REQ, + + H3S_T_UNKNOWN +}; + +/* State for request streams */ +enum h3s_st_req { + H3S_ST_REQ_BEFORE = 0, /* initial state */ + H3S_ST_REQ_HEADERS, /* header section received */ + H3S_ST_REQ_DATA, /* first DATA frame for content received */ + H3S_ST_REQ_TRAILERS, /* trailer section received */ +}; + +extern const struct qcc_app_ops h3_ops; + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_H3_T_H */ diff --git a/include/haproxy/h3_stats-t.h b/include/haproxy/h3_stats-t.h new file mode 100644 index 0000000..3c00f6c --- /dev/null +++ b/include/haproxy/h3_stats-t.h @@ -0,0 +1,12 @@ +#ifndef _HAPROXY_H3_STATS_T_H +#define _HAPROXY_H3_STATS_T_H + +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +extern struct stats_module h3_stats_module; + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_H3_STATS_T_H */ diff --git a/include/haproxy/h3_stats.h b/include/haproxy/h3_stats.h new file mode 100644 index 0000000..ed7c5e7 --- /dev/null +++ b/include/haproxy/h3_stats.h @@ -0,0 +1,17 @@ +#ifndef _HAPROXY_H3_STATS_H +#define _HAPROXY_H3_STATS_H + +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <haproxy/h3_stats-t.h> + +struct h3_counters; + +void h3_inc_err_cnt(void *ctx, int error_code); +void h3_inc_frame_type_cnt(struct h3_counters *ctrs, int frm_type); + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_H3_STATS_H */ diff --git a/include/haproxy/hash.h b/include/haproxy/hash.h new file mode 100644 index 0000000..cb506c7 --- /dev/null +++ b/include/haproxy/hash.h @@ -0,0 +1,33 @@ +/* + * include/haproxy/hash.h + * Macros for different hashing function. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_HASH_H_ +#define _HAPROXY_HASH_H_ + +#include <inttypes.h> + +unsigned int hash_djb2(const void *input, int len); +unsigned int hash_wt6(const void *input, int len); +unsigned int hash_sdbm(const void *input, int len); +unsigned int hash_crc32(const void *input, int len); +uint32_t hash_crc32c(const void *input, int len); + +#endif /* _HAPROXY_HASH_H_ */ diff --git a/include/haproxy/hlua-t.h b/include/haproxy/hlua-t.h new file mode 100644 index 0000000..2672ffd --- /dev/null +++ b/include/haproxy/hlua-t.h @@ -0,0 +1,243 @@ +/* + * include/haproxy/hlua-t.h + * Lua core types definitions + * + * Copyright (C) 2015-2016 Thierry Fournier <tfournier@arpalert.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_HLUA_T_H +#define _HAPROXY_HLUA_T_H + +#ifdef USE_LUA + +#include <lua.h> +#include <lauxlib.h> +#include <stdint.h> + +#include <import/ebtree-t.h> + +#include <haproxy/proxy-t.h> +#include <haproxy/regex-t.h> +#include <haproxy/server-t.h> +#include <haproxy/stick_table-t.h> +#include <haproxy/xref-t.h> +#include <haproxy/event_hdl-t.h> + +#define CLASS_CORE "Core" +#define CLASS_TXN "TXN" +#define CLASS_FETCHES "Fetches" +#define CLASS_CONVERTERS "Converters" +#define CLASS_SOCKET "Socket" +#define CLASS_CHANNEL "Channel" +#define CLASS_HTTP "HTTP" +#define CLASS_HTTP_MSG "HTTPMessage" +#define CLASS_HTTPCLIENT "HTTPClient" +#define CLASS_MAP "Map" +#define CLASS_APPLET_TCP "AppletTCP" +#define CLASS_APPLET_HTTP "AppletHTTP" +#define CLASS_PROXY "Proxy" +#define CLASS_SERVER "Server" +#define CLASS_LISTENER "Listener" +#define CLASS_EVENT_SUB "EventSub" +#define CLASS_REGEX "Regex" +#define CLASS_STKTABLE "StickTable" +#define CLASS_CERTCACHE "CertCache" +#define CLASS_PROXY_LIST "ProxyList" +#define CLASS_SERVER_LIST "ServerList" + +struct stream; + +#define HLUA_RUN 0x00000001 +#define HLUA_CTRLYIELD 0x00000002 +#define HLUA_WAKERESWR 0x00000004 +#define HLUA_WAKEREQWR 0x00000008 +#define HLUA_EXIT 0x00000010 +#define HLUA_NOYIELD 0x00000020 + +#define HLUA_F_AS_STRING 0x01 +#define HLUA_F_MAY_USE_HTTP 0x02 + +/* HLUA TXN flags */ +#define HLUA_TXN_NOTERM 0x00000001 +/* 0x00000002 .. 0x00000008 unused */ + +/* The execution context (enum), bits values from 0x00000010 to + * 0x00000030. These flags are mutually exclusives. Only one must be set at a + * time. 
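+ * As a small sketch, code which needs to know its execution context can
+ * switch on the masked flags:
+ *
+ *	switch (flags & HLUA_TXN_CTX_MASK) {
+ *	case HLUA_TXN_SMP_CTX: ... // sample fetch context
+ *	case HLUA_TXN_ACT_CTX: ... // action context
+ *	case HLUA_TXN_FLT_CTX: ... // filter context
+ *	}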
+ */
+#define HLUA_TXN_SMP_NONE 0x00000000 /* No specific execution context */
+#define HLUA_TXN_SMP_CTX  0x00000010 /* Executed from a sample fetch context */
+#define HLUA_TXN_ACT_CTX  0x00000020 /* Executed from an action context */
+#define HLUA_TXN_FLT_CTX  0x00000030 /* Executed from a filter context */
+#define HLUA_TXN_CTX_MASK 0x00000030 /* Mask to get the execution context */
+
+
+#define HLUA_CONCAT_BLOCSZ 2048
+
+enum hlua_exec {
+	HLUA_E_OK = 0,
+	HLUA_E_AGAIN,  /* LUA yield, must resume the stack execution later, when
+	                  the associated task is woken. */
+	HLUA_E_ETMOUT, /* Execution timeout */
+	HLUA_E_NOMEM,  /* Out of memory error */
+	HLUA_E_YIELD,  /* LUA code tried to yield, and this is not allowed */
+	HLUA_E_ERRMSG, /* LUA stack execution failed with a string error message
+	                  in the top of stack. */
+	HLUA_E_ERR,    /* LUA stack execution failed without error message. */
+};
+
+struct hlua_timer {
+	uint32_t start;      /* cpu time in ms when the timer was started */
+	uint32_t burst;      /* execution time for the current call in ms */
+	uint32_t cumulative; /* cumulative execution time for the coroutine in ms */
+	uint32_t max;        /* max (cumulative) execution time for the coroutine in ms */
+};
+
+struct hlua {
+	lua_State *T;   /* The LUA stack. */
+	int state_id;   /* contains the lua state id. 0 is common state, 1 to n are per-thread states.*/
+	int Tref;       /* The reference of the stack in coroutine case.
+	                   -1 for the main lua stack. */
+	int Mref;       /* The reference of the memory context in coroutine case.
+	                   -1 if the memory context is not used. */
+	int nargs;      /* The number of arguments in the stack at the start of execution. */
+	unsigned int flags; /* The current execution flags. */
+	int wake_time;  /* The lua wants to be woken at this time, or before. (ticks) */
+	struct hlua_timer timer; /* lua multipurpose timer */
+	struct task *task; /* The task associated with the lua stack execution.
+	                      We must wake this task to continue the task execution */
+	struct list com; /* The list head of the signals attached to this task. */
+	struct mt_list hc_list; /* list of httpclient associated to this lua task */
+	struct ebpt_node node;
+	int gc_count;   /* number of items which need a GC */
+};
+
+/* This is a part of the list containing references to functions
+ * called at initialisation time.
+ */
+struct hlua_init_function {
+	struct list l;
+	int function_ref;
+};
+
+/* This struct contains the lua data used to bind
+ * Lua functions on HAProxy hooks like sample-fetches
+ * or actions.
+ */
+struct hlua_function {
+	struct list l;
+	char *name;
+	int function_ref[MAX_THREADS + 1];
+	int nargs;
+};
+
+/* This struct is used with the structs:
+ * - http_req_rule
+ * - http_res_rule
+ * - tcp_rule
+ * It contains the lua execution configuration.
+ */
+struct hlua_rule {
+	struct hlua_function *fcn;
+	char **args;
+};
+
+/* This struct contains the pointer provided on most of the internal
+ * HAProxy calls during the processing of rules, converters and
+ * sample-fetches. This struct is associated with the lua object
+ * called "TXN".
+ */
+struct hlua_txn {
+	struct stream *s;
+	struct proxy *p;
+	int dir;   /* SMP_OPT_DIR_{REQ,RES} */
+	int flags;
+};
+
+/* This struct contains the applet context. */
+struct hlua_appctx {
+	struct appctx *appctx;
+	luaL_Buffer b; /* buffer used to prepare strings. */
+	struct hlua_txn htxn;
+};
+
+/* This struct is used with sample fetches and sample converters.
*/ +struct hlua_smp { + struct stream *s; + struct proxy *p; + unsigned int flags; /* LUA_F_OPT_* */ + int dir; /* SMP_OPT_DIR_{REQ,RES} */ +}; + +/* This struct contains data used with sleep functions. */ +struct hlua_sleep { + struct task *task; /* task associated with sleep. */ + struct list com; /* list of signal to wake at the end of sleep. */ + unsigned int wakeup_ms; /* hour to wakeup. */ +}; + +/* This struct is used to create coprocess doing TCP or + * SSL I/O. It uses a fake stream. + */ +struct hlua_socket { + struct xref xref; /* cross reference with the stream used for socket I/O. */ + luaL_Buffer b; /* buffer used to prepare strings. */ + unsigned long tid; /* Store the thread id which creates the socket. */ +}; + +struct hlua_concat { + int size; + int len; +}; + +/* This struct is used to store the httpclient */ +struct hlua_httpclient { + struct httpclient *hc; /* ptr to the httpclient instance */ + size_t sent; /* payload sent */ + luaL_Buffer b; /* buffer used to prepare strings. */ + struct mt_list by_hlua; /* linked in the current hlua task */ +}; + +struct hlua_proxy_list { + char capabilities; +}; + +struct hlua_proxy_list_iterator_context { + struct proxy *next; + char capabilities; +}; + +struct hlua_server_list { + struct proxy *px; +}; + +struct hlua_server_list_iterator_context { + struct server *cur; + struct proxy *px; +}; + +#else /* USE_LUA */ +/************************ For use when Lua is disabled ********************/ + +/* Empty struct for compilation compatibility */ +struct hlua { }; +struct hlua_socket { }; +struct hlua_rule { }; + +#endif /* USE_LUA */ + +#endif /* _HAPROXY_HLUA_T_H */ diff --git a/include/haproxy/hlua.h b/include/haproxy/hlua.h new file mode 100644 index 0000000..3c67cce --- /dev/null +++ b/include/haproxy/hlua.h @@ -0,0 +1,81 @@ +/* + * include/haproxy/hlua.h + * Lua core management functions + * + * Copyright (C) 2015-2016 Thierry Fournier <tfournier@arpalert.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_HLUA_H +#define _HAPROXY_HLUA_H + +#include <haproxy/hlua-t.h> + +#ifdef USE_LUA + +/* The following macros are used to set flags. 
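+ * A small illustration (the hlua pointer is hypothetical):
+ *
+ *	HLUA_SET_RUN(hlua);
+ *	if (HLUA_IS_RUNNING(hlua))
+ *		... // resume the coroutine
+ *	HLUA_CLR_RUN(hlua);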
*/ +#define HLUA_SET_RUN(__hlua) do {(__hlua)->flags |= HLUA_RUN;} while(0) +#define HLUA_CLR_RUN(__hlua) do {(__hlua)->flags &= ~HLUA_RUN;} while(0) +#define HLUA_IS_RUNNING(__hlua) ((__hlua)->flags & HLUA_RUN) +#define HLUA_SET_CTRLYIELD(__hlua) do {(__hlua)->flags |= HLUA_CTRLYIELD;} while(0) +#define HLUA_CLR_CTRLYIELD(__hlua) do {(__hlua)->flags &= ~HLUA_CTRLYIELD;} while(0) +#define HLUA_IS_CTRLYIELDING(__hlua) ((__hlua)->flags & HLUA_CTRLYIELD) +#define HLUA_SET_WAKERESWR(__hlua) do {(__hlua)->flags |= HLUA_WAKERESWR;} while(0) +#define HLUA_CLR_WAKERESWR(__hlua) do {(__hlua)->flags &= ~HLUA_WAKERESWR;} while(0) +#define HLUA_IS_WAKERESWR(__hlua) ((__hlua)->flags & HLUA_WAKERESWR) +#define HLUA_SET_WAKEREQWR(__hlua) do {(__hlua)->flags |= HLUA_WAKEREQWR;} while(0) +#define HLUA_CLR_WAKEREQWR(__hlua) do {(__hlua)->flags &= ~HLUA_WAKEREQWR;} while(0) +#define HLUA_IS_WAKEREQWR(__hlua) ((__hlua)->flags & HLUA_WAKEREQWR) +#define HLUA_CLR_NOYIELD(__hlua) do {(__hlua)->flags &= ~HLUA_NOYIELD;} while(0) +#define HLUA_SET_NOYIELD(__hlua) do {(__hlua)->flags |= HLUA_NOYIELD;} while(0) +#define HLUA_CANT_YIELD(__hlua) ((__hlua)->flags & HLUA_NOYIELD) + + +#define HLUA_INIT(__hlua) do { (__hlua)->T = 0; } while(0) + +/* Lua HAProxy integration functions. */ +const char *hlua_traceback(lua_State *L, const char* sep); +void hlua_ctx_destroy(struct hlua *lua); +void hlua_init(); +int hlua_post_init(); +void hlua_applet_tcp_fct(struct appctx *ctx); +void hlua_applet_http_fct(struct appctx *ctx); +int hlua_event_sub(lua_State *L, event_hdl_sub_list *sub_list); +struct task *hlua_process_task(struct task *task, void *context, unsigned int state); +const char *hlua_show_current_location(const char *pfx); +int hlua_ref(lua_State *L); +void hlua_pushref(lua_State *L, int ref); +void hlua_unref(lua_State *L, int ref); +struct hlua *hlua_gethlua(lua_State *L); +void hlua_yieldk(lua_State *L, int nresults, lua_KContext ctx, lua_KFunction k, int timeout, unsigned int flags); + +#else /* USE_LUA */ + +/************************ For use when Lua is disabled ********************/ + +#define HLUA_IS_RUNNING(__hlua) 0 + +#define HLUA_INIT(__hlua) + +/* Empty function for compilation without Lua. */ +static inline void hlua_init() { } +static inline int hlua_post_init() { return 1; } +static inline void hlua_ctx_destroy(struct hlua *lua) { } +static inline const char *hlua_show_current_location(const char *pfx) { return NULL; } + +#endif /* USE_LUA */ + +#endif /* _HAPROXY_HLUA_H */ diff --git a/include/haproxy/hlua_fcn.h b/include/haproxy/hlua_fcn.h new file mode 100644 index 0000000..ff9250a --- /dev/null +++ b/include/haproxy/hlua_fcn.h @@ -0,0 +1,41 @@ +/* + * include/haproxy/hlua_fcn.h + * Lua user-level management functions + * + * Copyright (C) 2015-2016 Thierry Fournier <tfournier@arpalert.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_HLUA_FCN_H +#define _HAPROXY_HLUA_FCN_H + +#include <lua.h> +#include <haproxy/hlua-t.h> + +int hlua_checkboolean(lua_State *L, int index); + +void hlua_class_const_int(lua_State *L, const char *name, int value); +void hlua_class_const_str(lua_State *L, const char *name, const char *value); +void hlua_class_function(lua_State *L, const char *name, int (*function)(lua_State *L)); +void *hlua_checkudata(lua_State *L, int ud, int class_ref); +int hlua_register_metatable(struct lua_State *L, char *name); +void hlua_fcn_reg_core_fcn(lua_State *L); +int hlua_dump_object(lua_State *L); +int hlua_fcn_new_proxy(lua_State *L, struct proxy *px); +int hlua_fcn_new_server(lua_State *L, struct server *srv); +int hlua_fcn_new_event_sub(lua_State *L, struct event_hdl_sub *sub); + +#endif /* _HAPROXY_HLUA_FCN_H */ diff --git a/include/haproxy/hpack-dec.h b/include/haproxy/hpack-dec.h new file mode 100644 index 0000000..4fb1a36 --- /dev/null +++ b/include/haproxy/hpack-dec.h @@ -0,0 +1,39 @@ +/* + * HPACK decompressor (RFC7541) + * + * Copyright (C) 2014-2020 Willy Tarreau <willy@haproxy.org> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef _COMMON_HPACK_DEC_H +#define _COMMON_HPACK_DEC_H + +#include <haproxy/api.h> +#include <haproxy/chunk.h> +#include <haproxy/hpack-tbl.h> + +int hpack_decode_frame(struct hpack_dht *dht, const uint8_t *raw, uint32_t len, + struct http_hdr *list, int list_size, + struct buffer *tmp); + +#endif /* _COMMON_HPACK_DEC_H */ diff --git a/include/haproxy/hpack-enc.h b/include/haproxy/hpack-enc.h new file mode 100644 index 0000000..7511c5d --- /dev/null +++ b/include/haproxy/hpack-enc.h @@ -0,0 +1,261 @@ +/* + * HPACK compressor (RFC7541) + * + * Copyright (C) 2014-2020 Willy Tarreau <willy@haproxy.org> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _COMMON_HPACK_ENC_H +#define _COMMON_HPACK_ENC_H + +#include <string.h> +#include <import/ist.h> +#include <haproxy/api.h> +#include <haproxy/buf-t.h> +#include <haproxy/http-t.h> + +int hpack_encode_header(struct buffer *out, const struct ist n, + const struct ist v); + +/* Returns the number of bytes required to encode the string length <len>. The + * number of usable bits is an integral multiple of 7 plus 6 for the last byte. + * The maximum number of bytes returned is 4 (2097279 max length). Larger values + * return 0. + */ +static inline int hpack_len_to_bytes(size_t len) +{ + ssize_t slen = len; + + slen -= 127; + if (__builtin_expect(slen < 0, 1)) + return 1; + if (slen < (1 << 14)) { + if (__builtin_expect(slen < (1 << 7), 1)) + return 2; + else + return 3; + } + if (slen < (1 << 21)) + return 4; + return 0; +} + +/* Encodes <len> into <out>+<pos> and return the new position. The caller is + * responsible for checking for available room using hpack_len_to_bytes() + * first. + */ +static inline int hpack_encode_len(char *out, int pos, int len) +{ + int code = len - 127; + + if (code < 0) { + out[pos++] = len; + } else { + out[pos++] = 127; + for (; code >= 128; code >>= 7) + out[pos++] = code | 128; + out[pos++] = code; + } + return pos; +} + +/* Tries to encode header field index <idx> with short value <val> into the + * aligned buffer <out>. Returns non-zero on success, 0 on failure (buffer + * full). The caller is responsible for ensuring that the length of <val> is + * strictly lower than 127, and that <idx> is lower than 64 (static list only), + * and that the buffer is aligned (head==0). 
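+ *
+ * For example (purely illustrative): with idx=8 (":status") and val="404",
+ * the emitted bytes are 0x48 (0x40|8), 0x03 (the value length), then the
+ * three characters '4', '0', '4'.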
+ */ +static inline int hpack_encode_short_idx(struct buffer *out, int idx, struct ist val) +{ + if (out->data + 2 + val.len > out->size) + return 0; + + /* literal header field with incremental indexing */ + out->area[out->data++] = idx | 0x40; + out->area[out->data++] = val.len; + ist2bin(&out->area[out->data], val); + out->data += val.len; + return 1; +} + +/* Tries to encode header field index <idx> with long value <val> into the + * aligned buffer <out>. Returns non-zero on success, 0 on failure (buffer + * full). The caller is responsible for ensuring <idx> is lower than 64 (static + * list only), and that the buffer is aligned (head==0). + */ +static inline int hpack_encode_long_idx(struct buffer *out, int idx, struct ist val) +{ + int len = out->data; + + if (!hpack_len_to_bytes(val.len) || + 1 + len + hpack_len_to_bytes(val.len) + val.len > out->size) + return 0; + + /* emit literal with indexing (7541#6.2.1) : + * [ 0 | 1 | Index (6+) ] + */ + out->area[len++] = idx | 0x40; + len = hpack_encode_len(out->area, len, val.len); + memcpy(out->area + len, val.ptr, val.len); + len += val.len; + + out->data = len; + return 1; +} + +/* Tries to encode a :status pseudo-header with the integer status <status> + * into the aligned buffer <out>. Returns non-zero on success, 0 on failure + * (buffer full). The caller is responsible for ensuring that the status is + * comprised between 100 and 999 inclusive and that the buffer is aligned. It's + * inlined because it's easily optimizable by the compiler. + */ +static inline int hpack_encode_int_status(struct buffer *out, unsigned int status) +{ + int len = out->data; + int size = out->size; + unsigned char c = 0; + + /* try to emit a single byte code */ + len++; + if (__builtin_expect(len > size, 0)) + goto fail; + + c = (status <= 304) ? + (status <= 204) ? + (status == 204) ? 0x89 : + (status == 200) ? 0x88 : + 0: /* > 204 */ + (status == 304) ? 0x8b : + (status == 206) ? 0x8a : + 0: + (status <= 404) ? + (status == 404) ? 0x8d : + (status == 400) ? 0x8c : + 0: /* > 404 */ + (status == 500) ? 0x8e : + 0; + + if (c) + goto last; + + /* fall back to literal */ + len += 4; + if (__builtin_expect(len > size, 0)) + goto fail; + + /* basic encoding of the status code */ + out->area[len - 5] = 0x48; // indexed name -- name=":status" (idx 8) + out->area[len - 4] = 0x03; // 3 bytes status + out->area[len - 3] = '0' + status / 100; + out->area[len - 2] = '0' + status / 10 % 10; + c = '0' + status % 10; + last: + out->area[len - 1] = c; + out->data = len; + return 1; + fail: + return 0; +} + +/* Tries to encode a :status pseudo-header with the integer status <status> + * also represented by <str> into the aligned buffer <out>. Returns non-zero + * on success or 0 on failure (buffer full). The caller is responsible for + * ensuring that the status is comprised between 100 and 999 inclusive, that + * <str> contains a valid representation of the numerical value, and that the + * buffer is aligned. This version is preferred when the caller already knows + * a string representation of the status because it avoids the computation in + * the uncompressed case. It's inlined because it's easily optimizable. + */ +static inline int hpack_encode_str_status(struct buffer *out, unsigned int status, struct ist str) +{ + /* don't try too hard, we already have the ASCII value for less common cases */ + if (status == 200 || status == 304) { + if (out->data >= out->size) + return 0; + out->area[out->data] = (status == 304) ? 
0x8b : 0x88; + out->data++; + return 1; + } + return hpack_encode_short_idx(out, 8, str); // name=":status" (idx 8) +} + +/* Tries to encode a :method pseudo-header with the method in <meth>, which + * also exists as a string in <str>, into the aligned buffer <out>. Returns + * non-zero on success or 0 on failure (buffer full). The caller is responsible + * for ensuring that the string matches <meth>, that it's smaller than 127 + * bytes, and that the buffer is aligned. If <meth> is unknown then using + * HTTP_METH_OTHER will lead to the string being encoded as a literal. It's + * inlined because it's easily optimizable. + */ +static inline int hpack_encode_method(struct buffer *out, enum http_meth_t meth, struct ist str) +{ + if (out->data < out->size && meth == HTTP_METH_GET) + out->area[out->data++] = 0x82; // indexed field : idx[02]=(":method", "GET") + else if (out->data < out->size && meth == HTTP_METH_POST) + out->area[out->data++] = 0x83; // indexed field : idx[03]=(":method", "POST") + else + return hpack_encode_short_idx(out, 2, str); // name=":method" (idx 2) + return 1; +} + +/* Tries to encode a :scheme pseudo-header with the scheme in <scheme>, into + * the aligned buffer <out>. Returns non-zero on success or 0 on failure + * (buffer full). Only "http" and "https" are recognized and handled as indexed + * values, others are turned into short literals. The caller is responsible for + * ensuring that the scheme is smaller than 127 bytes, and that the buffer is + * aligned. Normally the compiler will detect constant strings in the comparison + * if the code remains inlined. + */ +static inline int hpack_encode_scheme(struct buffer *out, struct ist scheme) +{ + if (out->data < out->size && isteq(scheme, ist("https"))) + out->area[out->data++] = 0x87; // indexed field : idx[07]=(":scheme", "https") + else if (out->data < out->size && isteq(scheme, ist("http"))) + out->area[out->data++] = 0x86; // indexed field : idx[06]=(":scheme", "http") + else + return hpack_encode_short_idx(out, 6, scheme); // name=":scheme" (idx 6) + return 1; +} + +/* Tries to encode a :path pseudo-header with the path in <path>, into the + * aligned buffer <out>. Returns non-zero on success or 0 on failure (buffer + * full). The well-known values "/" and "/index.html" are recognized, and other + * ones are handled as literals. The caller is responsible for ensuring that + * the buffer is aligned. Normally the compiler will detect constant strings + * in the comparison if the code remains inlined. 
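+ *
+ * For example (purely illustrative): "/" is emitted as the single indexed
+ * byte 0x84, while a short literal such as "/api" goes through
+ * hpack_encode_short_idx() and yields 0x44 (4|0x40), 0x04 (the length), then
+ * the four path characters.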
+ */ +static inline int hpack_encode_path(struct buffer *out, struct ist path) +{ + if (out->data < out->size && isteq(path, ist("/"))) + out->area[out->data++] = 0x84; // indexed field : idx[04]=(":path", "/") + else if (out->data < out->size && isteq(path, ist("/index.html"))) + out->area[out->data++] = 0x85; // indexed field : idx[05]=(":path", "/index.html") + else if (path.len < 127) + return hpack_encode_short_idx(out, 4, path); // name=":path" (idx 4) + else + return hpack_encode_long_idx(out, 4, path); // name=":path" (idx 4) + return 1; +} + + +#endif /* _COMMON_HPACK_ENC_H */ diff --git a/include/haproxy/hpack-huff.h b/include/haproxy/hpack-huff.h new file mode 100644 index 0000000..f939103 --- /dev/null +++ b/include/haproxy/hpack-huff.h @@ -0,0 +1,35 @@ +/* + * Huffman decoding and encoding for HPACK (RFC7541) + * + * Copyright (C) 2014-2020 Willy Tarreau <willy@haproxy.org> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _HAPROXY_HPACK_HUFF_H +#define _HAPROXY_HPACK_HUFF_H + +#include <inttypes.h> + +int huff_enc(const char *s, char *out); +int huff_dec(const uint8_t *huff, int hlen, char *out, int olen); + +#endif /* _HAPROXY_HPACK_HUFF_H */ diff --git a/include/haproxy/hpack-tbl-t.h b/include/haproxy/hpack-tbl-t.h new file mode 100644 index 0000000..4e5d536 --- /dev/null +++ b/include/haproxy/hpack-tbl-t.h @@ -0,0 +1,143 @@ +/* + * HPACK header table management (RFC7541) - type definitions + * + * Copyright (C) 2014-2020 Willy Tarreau <willy@haproxy.org> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _HAPROXY_HPACK_TBL_T_H +#define _HAPROXY_HPACK_TBL_T_H + +#include <inttypes.h> + +/* Dynamic Headers Table, usable for tables up to 4GB long and values of 64kB-1. + * The model can be improved by using offsets relative to the table entry's end + * or to the end of the area, or by moving the descriptors at the end of the + * table and the data at the beginning. This entry is 8 bytes long, which is 1/4 + * of the bookkeeping planned by the HPACK spec. Thus it saves 24 bytes per + * header field, meaning that even with a single header, 24 extra bytes can be + * stored (ie one such descriptor). At 29.2 average bytes per header field as + * found in the hpack test case, that's slightly more than 1.5kB of space saved + * from a 4kB block, resulting in contiguous space almost always being + * available. + * + * Principle: the table is stored in a contiguous array containing both the + * descriptors and the contents. Descriptors are stored at the beginning of the + * array while contents are stored starting from the end. Most of the time there + * is enough room left in the table to insert a new header field, thanks to the + * savings on the descriptor size. Thus by inserting headers from the end it's + * possible to maximize the delay before a collision of DTEs and data. In order + * to always insert from the right, we need to keep a reference to the latest + * inserted element and look before it. The last inserted cell's address defines + * the lowest known address still in use, unless the area wraps in which case + * the available space lies between the end of the tail and the beginning of the + * head. + * + * In order to detect collisions between data blocks and DTEs, we also maintain + * an index to the lowest element facing the DTE table, called "front". This one + * is updated each time an element is inserted before it. Once the buffer wraps, + * this element doesn't have to be updated anymore until it is released, in + * which case the buffer doesn't wrap anymore and the front element becomes the + * head again. + * + * Various heuristics are possible concerning the opportunity to wrap the + * entries to limit the risk of collisions with the DTE, but experimentation + * shows that thanks to the important savings made on the descriptors, the + * likeliness of finding a large amount of free space at the end of the area is + * much higher than the risk of colliding, so in the end the most naive + * algorithms work pretty fine. Typical ratios of 1 collision per 2000 requests + * have been observed. + * + * The defragmentation should be rare ; a study on live data shows on average + * 29.2 bytes used per header field. This plus the 32 bytes overhead fix an + * average of 66.9 header fields per 4kB table. This brings a 1606 bytes saving + * using the current storage description, ensuring that oldest headers are + * linearly removed by the sender before fragmentation occurs. This means that + * for all smaller header fields there will not be any requirement to defragment + * the area and most of the time it will even be possible to copy the old values + * directly within the buffer after creating a new entry. 
On average within the + * available space there will be enough room to store 1606/(29.2+8)=43 extra + * header fields without switching to another place. + * + * The table header fits in the table itself, it only takes 16 bytes, so in the + * worst case (1 single header) it's possible to store 4096 - 16 - 8 = 4072 + * data bytes, which is larger than the 4064 the protocol requires (4096 - 32). + */ + +/* + * Gcc before 3.0 needs [0] to declare a variable-size array + */ +#ifndef VAR_ARRAY +#if defined(__GNUC__) && (__GNUC__ < 3) +#define VAR_ARRAY 0 +#else +#define VAR_ARRAY +#endif +#endif + +/* One dynamic table entry descriptor */ +struct hpack_dte { + uint32_t addr; /* storage address, relative to the dte address */ + uint16_t nlen; /* header name length */ + uint16_t vlen; /* header value length */ +}; + +/* Note: the table's head plus a struct hpack_dte must be smaller than or equal to 32 + * bytes so that a single large header can always fit. Here that's 16 bytes for + * the header, plus 8 bytes per slot. + * Note that when <used> == 0, front, head, and wrap are undefined. + */ +struct hpack_dht { + uint32_t size; /* allocated table size in bytes */ + uint32_t total; /* sum of nlen + vlen in bytes */ + uint16_t front; /* slot number of the first node after the idx table */ + uint16_t wrap; /* number of allocated slots, wraps here */ + uint16_t head; /* last inserted slot number */ + uint16_t used; /* number of slots in use */ + struct hpack_dte dte[VAR_ARRAY]; /* dynamic table entries */ +}; + +/* supported hpack encoding/decoding errors */ +enum { + HPACK_ERR_NONE = 0, /* no error */ + HPACK_ERR_ALLOC_FAIL, /* memory allocation error */ + HPACK_ERR_UNKNOWN_OPCODE, /* invalid first byte */ + HPACK_ERR_TRUNCATED, /* truncated stream */ + HPACK_ERR_HUFFMAN, /* huffman decoding error */ + HPACK_ERR_INVALID_PHDR, /* invalid pseudo header field name */ + HPACK_ERR_MISPLACED_PHDR, /* pseudo header field after a regular header field */ + HPACK_ERR_DUPLICATE_PHDR, /* duplicate pseudo header field */ + HPACK_ERR_DHT_INSERT_FAIL, /* failed to insert into DHT */ + HPACK_ERR_TOO_LARGE, /* decoded request/response is too large */ + HPACK_ERR_MISSING_METHOD, /* :method is missing */ + HPACK_ERR_MISSING_SCHEME, /* :scheme is missing */ + HPACK_ERR_MISSING_PATH, /* :path is missing */ + HPACK_ERR_MISSING_AUTHORITY, /* :authority is missing with CONNECT */ + HPACK_ERR_SCHEME_NOT_ALLOWED, /* :scheme not allowed with CONNECT */ + HPACK_ERR_PATH_NOT_ALLOWED, /* :path not allowed with CONNECT */ + HPACK_ERR_INVALID_ARGUMENT, /* an invalid argument was passed */ +}; + +/* static header table as in RFC7541 Appendix A. [0] unused. 
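+ * Entries 1..61 hold the static fields from Appendix A; for instance index 2
+ * is (":method", "GET"), index 4 is (":path", "/") and index 8 is
+ * (":status", "200"), which is what the 0x82, 0x84 and 0x88 (0x80 | index)
+ * shortcuts in hpack-enc.h rely on.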
*/ +#define HPACK_SHT_SIZE 62 + +#endif /* _HAPROXY_HPACK_TBL_T_H */ diff --git a/include/haproxy/hpack-tbl.h b/include/haproxy/hpack-tbl.h new file mode 100644 index 0000000..02cf7db --- /dev/null +++ b/include/haproxy/hpack-tbl.h @@ -0,0 +1,184 @@ +/* + * HPACK header table management (RFC7541) - prototypes + * + * Copyright (C) 2014-2020 Willy Tarreau <willy@haproxy.org> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _HAPROXY_HPACK_TBL_H +#define _HAPROXY_HPACK_TBL_H + +#include <import/ist.h> +#include <haproxy/api.h> +#include <haproxy/hpack-tbl-t.h> +#include <haproxy/http-hdr-t.h> + +/* when built outside of haproxy, HPACK_STANDALONE must be defined, and + * pool_head_hpack_tbl->size must be set to the DHT size. + */ +#ifndef HPACK_STANDALONE +#include <haproxy/pool.h> +#define hpack_alloc(pool) pool_alloc(pool) +#define hpack_free(pool, ptr) pool_free(pool, ptr) +#else +#include <stdlib.h> +#include <haproxy/pool-t.h> +#define hpack_alloc(pool) malloc(pool->size) +#define hpack_free(pool, ptr) free(ptr) +#endif + +extern const struct http_hdr hpack_sht[HPACK_SHT_SIZE]; +extern struct pool_head *pool_head_hpack_tbl; + +int __hpack_dht_make_room(struct hpack_dht *dht, unsigned int needed); +int hpack_dht_insert(struct hpack_dht *dht, struct ist name, struct ist value); + +#ifdef DEBUG_HPACK +void hpack_dht_dump(FILE *out, const struct hpack_dht *dht); +void hpack_dht_check_consistency(const struct hpack_dht *dht); +#endif + +/* return a pointer to the entry designated by index <idx> (starting at 1) or + * NULL if this index is not there. + */ +static inline const struct hpack_dte *hpack_get_dte(const struct hpack_dht *dht, uint16_t idx) +{ + idx--; + + if (idx >= dht->used) + return NULL; + + if (idx <= dht->head) + idx = dht->head - idx; + else + idx = dht->head - idx + dht->wrap; + + return &dht->dte[idx]; +} + +/* returns non-zero if <idx> is valid for table <dht> */ +static inline int hpack_valid_idx(const struct hpack_dht *dht, uint32_t idx) +{ + return idx < dht->used + HPACK_SHT_SIZE; +} + +/* return a pointer to the header name for entry <dte>. */ +static inline struct ist hpack_get_name(const struct hpack_dht *dht, const struct hpack_dte *dte) +{ + struct ist ret = { + .ptr = (void *)dht + dte->addr, + .len = dte->nlen, + }; + return ret; +} + +/* return a pointer to the header value for entry <dte>. 
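+ * The value is stored right after the name in the same data block, hence the
+ * dte->addr + dte->nlen offset below.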
*/ +static inline struct ist hpack_get_value(const struct hpack_dht *dht, const struct hpack_dte *dte) +{ + struct ist ret = { + .ptr = (void *)dht + dte->addr + dte->nlen, + .len = dte->vlen, + }; + return ret; +} + +/* takes an idx, returns the associated name */ +static inline struct ist hpack_idx_to_name(const struct hpack_dht *dht, uint32_t idx) +{ + const struct hpack_dte *dte; + + if (idx < HPACK_SHT_SIZE) + return hpack_sht[idx].n; + + dte = hpack_get_dte(dht, idx - HPACK_SHT_SIZE + 1); + if (!dte) + return ist("### ERR ###"); // error + + return hpack_get_name(dht, dte); +} + +/* takes an idx, returns the associated value */ +static inline struct ist hpack_idx_to_value(const struct hpack_dht *dht, uint32_t idx) +{ + const struct hpack_dte *dte; + + if (idx < HPACK_SHT_SIZE) + return hpack_sht[idx].v; + + dte = hpack_get_dte(dht, idx - HPACK_SHT_SIZE + 1); + if (!dte) + return ist("### ERR ###"); // error + + return hpack_get_value(dht, dte); +} + +/* returns the slot number of the oldest entry (tail). Must not be used on an + * empty table. + */ +static inline unsigned int hpack_dht_get_tail(const struct hpack_dht *dht) +{ + return ((dht->head + 1U < dht->used) ? dht->wrap : 0) + dht->head + 1U - dht->used; +} + +/* Purges table dht until a header field of <needed> bytes fits according to + * the protocol (adding 32 bytes overhead). Returns non-zero on success, zero + * on failure (ie: table empty but still not sufficient). + */ +static inline int hpack_dht_make_room(struct hpack_dht *dht, unsigned int needed) +{ + if (dht->used * 32 + dht->total + needed + 32 <= dht->size) + return 1; + else if (!dht->used) + return 0; + + return __hpack_dht_make_room(dht, needed); +} + +/* allocate a dynamic headers table of <size> bytes and return it initialized */ +static inline void hpack_dht_init(struct hpack_dht *dht, uint32_t size) +{ + dht->size = size; + dht->total = 0; + dht->used = 0; +} + +/* allocate a dynamic headers table from the pool and return it initialized */ +static inline struct hpack_dht *hpack_dht_alloc() +{ + struct hpack_dht *dht; + + if (unlikely(!pool_head_hpack_tbl)) + return NULL; + + dht = hpack_alloc(pool_head_hpack_tbl); + if (dht) + hpack_dht_init(dht, pool_head_hpack_tbl->size); + return dht; +} + +/* free a dynamic headers table */ +static inline void hpack_dht_free(struct hpack_dht *dht) +{ + hpack_free(pool_head_hpack_tbl, dht); +} + +#endif /* _HAPROXY_HPACK_TBL_H */ diff --git a/include/haproxy/hq_interop.h b/include/haproxy/hq_interop.h new file mode 100644 index 0000000..eb6ebf6 --- /dev/null +++ b/include/haproxy/hq_interop.h @@ -0,0 +1,6 @@ +#ifndef _HAPROXY_HQ_INTEROP_H_ +#define _HAPROXY_HQ_INTEROP_H_ + +extern const struct qcc_app_ops hq_interop_ops; + +#endif /* _HAPROXY_HQ_INTEROP_H_ */ diff --git a/include/haproxy/http-hdr-t.h b/include/haproxy/http-hdr-t.h new file mode 100644 index 0000000..3534f43 --- /dev/null +++ b/include/haproxy/http-hdr-t.h @@ -0,0 +1,41 @@ +/* + * include/haproxy/http-hdr-t.h + * HTTP header management (new model) - type definitions + * + * Copyright (C) 2014-2020 Willy Tarreau <willy@haproxy.org> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom 
the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _HAPROXY_HTTP_HDR_T_H +#define _HAPROXY_HTTP_HDR_T_H + +#include <import/ist.h> + +/* a header field made of a name and a value. Such structure stores 4 longs so + * it takes 16 bytes on 32-bit systems and 32 bytes on 64-bit systems. + */ +struct http_hdr { + struct ist n; /* name */ + struct ist v; /* value */ +}; + +#endif /* _HAPROXY_HTTP_HDR_T_H */ diff --git a/include/haproxy/http-hdr.h b/include/haproxy/http-hdr.h new file mode 100644 index 0000000..e9e253b --- /dev/null +++ b/include/haproxy/http-hdr.h @@ -0,0 +1,60 @@ +/* + * include/haproxy/http-hdr.h + * HTTP header management (new model) - functions + * + * Copyright (C) 2014-2017 Willy Tarreau <willy@haproxy.org> + * Copyright (C) 2017 HAProxy Technologies + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _HAPROXY_HTTP_HDR_H +#define _HAPROXY_HTTP_HDR_H + +#include <import/ist.h> +#include <haproxy/http-hdr-t.h> + +/* sets an http_hdr <hdr> to name <n> and value <v>. Useful to avoid casts in + * immediate assignments. + */ +static inline void http_set_hdr(struct http_hdr *hdr, const struct ist n, const struct ist v) +{ + hdr->n = n; + hdr->v = v; +} + +/* removes all occurrences of header name <n> in list <hdr> and returns the new count. The + * list must be terminated by the empty header. 
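+ *
+ * For example (purely illustrative): on the list {("Host", ...),
+ * ("X-Test", ...), ("", ...)}, http_del_hdr(hdr, ist("X-Test")) compacts the
+ * list in place to {("Host", ...), ("", ...)} and returns 2, the count
+ * including the empty terminating entry.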
+ */ +static inline int http_del_hdr(struct http_hdr *hdr, const struct ist n) +{ + int src = 0, dst = 0; + + do { + if (!isteqi(hdr[src].n, n)) { + if (src != dst) + hdr[dst] = hdr[src]; + dst++; + } + } while (hdr[src++].n.len); + + return dst; +} +#endif /* _HAPROXY_HTTP_HDR_H */ diff --git a/include/haproxy/http-t.h b/include/haproxy/http-t.h new file mode 100644 index 0000000..3165082 --- /dev/null +++ b/include/haproxy/http-t.h @@ -0,0 +1,184 @@ +/* + * include/haproxy/http-t.h + * + * Version-agnostic and implementation-agnostic HTTP protocol definitions. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_HTTP_T_H +#define _HAPROXY_HTTP_T_H + +#include <inttypes.h> +#include <import/ist.h> +#include <haproxy/buf-t.h> + +/* + * some macros mainly used when parsing header fields. + * from RFC7230: + * CTL = <any US-ASCII control character (octets 0 - 31) and DEL (127)> + * SEP = one of the 17 defined separators or SP or HT + * LWS = CR, LF, SP or HT + * SPHT = SP or HT. Use this macro and not a boolean expression for best speed. + * CRLF = CR or LF. Use this macro and not a boolean expression for best speed. + * token = any CHAR except CTL or SEP. Use this macro and not a boolean expression for best speed. + * + * added for ease of use: + * ver_token = 'H', 'P', 'T', '/', '.', and digits. 
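+ *
+ * For example, HTTP_IS_TOKEN('a') is non-zero while HTTP_IS_TOKEN(':') is
+ * zero, ':' being one of the separators.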
+ */
+#define HTTP_FLG_CTL  0x01
+#define HTTP_FLG_SEP  0x02
+#define HTTP_FLG_LWS  0x04
+#define HTTP_FLG_SPHT 0x08
+#define HTTP_FLG_CRLF 0x10
+#define HTTP_FLG_TOK  0x20
+#define HTTP_FLG_VER  0x40
+#define HTTP_FLG_DIG  0x80
+
+#define HTTP_IS_CTL(x)       (http_char_classes[(uint8_t)(x)] & HTTP_FLG_CTL)
+#define HTTP_IS_SEP(x)       (http_char_classes[(uint8_t)(x)] & HTTP_FLG_SEP)
+#define HTTP_IS_LWS(x)       (http_char_classes[(uint8_t)(x)] & HTTP_FLG_LWS)
+#define HTTP_IS_SPHT(x)      (http_char_classes[(uint8_t)(x)] & HTTP_FLG_SPHT)
+#define HTTP_IS_CRLF(x)      (http_char_classes[(uint8_t)(x)] & HTTP_FLG_CRLF)
+#define HTTP_IS_TOKEN(x)     (http_char_classes[(uint8_t)(x)] & HTTP_FLG_TOK)
+#define HTTP_IS_VER_TOKEN(x) (http_char_classes[(uint8_t)(x)] & HTTP_FLG_VER)
+#define HTTP_IS_DIGIT(x)     (http_char_classes[(uint8_t)(x)] & HTTP_FLG_DIG)
+
+/* Known HTTP methods */
+enum http_meth_t {
+	HTTP_METH_OPTIONS,
+	HTTP_METH_GET,
+	HTTP_METH_HEAD,
+	HTTP_METH_POST,
+	HTTP_METH_PUT,
+	HTTP_METH_DELETE,
+	HTTP_METH_TRACE,
+	HTTP_METH_CONNECT,
+	HTTP_METH_OTHER, /* Must be the last entry */
+} __attribute__((packed));
+
+/* Known HTTP authentication schemes */
+enum ht_auth_m {
+	HTTP_AUTH_WRONG = -1, /* missing or unknown */
+	HTTP_AUTH_UNKNOWN = 0,
+	HTTP_AUTH_BASIC,
+	HTTP_AUTH_DIGEST,
+	HTTP_AUTH_BEARER,
+} __attribute__((packed));
+
+/* All implemented HTTP status codes */
+enum {
+	HTTP_ERR_200 = 0,
+	HTTP_ERR_400,
+	HTTP_ERR_401,
+	HTTP_ERR_403,
+	HTTP_ERR_404,
+	HTTP_ERR_405,
+	HTTP_ERR_407,
+	HTTP_ERR_408,
+	HTTP_ERR_410,
+	HTTP_ERR_413,
+	HTTP_ERR_421,
+	HTTP_ERR_422,
+	HTTP_ERR_425,
+	HTTP_ERR_429,
+	HTTP_ERR_500,
+	HTTP_ERR_501,
+	HTTP_ERR_502,
+	HTTP_ERR_503,
+	HTTP_ERR_504,
+	HTTP_ERR_SIZE
+};
+
+/* Note: the strings below make use of chunks. Chunks may carry an allocated
+ * size in addition to the length. The size counts from the beginning (str)
+ * to the end. If the size is unknown, it MUST be zero, in which case the
+ * sample will automatically be duplicated when a change larger than <len> has
+ * to be performed. Thus it is safe to always set size to zero.
+ */
+struct http_meth {
+	enum http_meth_t meth;
+	struct buffer str;
+};
+
+struct http_auth_data {
+	enum ht_auth_m method; /* one of HTTP_AUTH_* */
+	/* 7 bytes unused here */
+	struct buffer method_data; /* points to the credential part from the 'Authorization:' header */
+	char *user, *pass;     /* extracted username & password */
+};
+
+struct http_method_desc {
+	enum http_meth_t meth;
+	const struct ist text;
+};
+
+enum http_etag_type {
+	ETAG_INVALID = 0,
+	ETAG_STRONG,
+	ETAG_WEAK
+};
+
+/* Indicates what elements have been parsed in an HTTP URI. */
+enum http_uri_parser_state {
+	URI_PARSER_STATE_BEFORE = 0,
+	URI_PARSER_STATE_SCHEME_DONE,
+	URI_PARSER_STATE_AUTHORITY_DONE,
+	URI_PARSER_STATE_PATH_DONE,
+};
+
+/* HTTP URI format as described in rfc 7230 5.3.
+ * As the first character is used to identify the format, absolute-form and
+ * authority-form are not differentiated.
+ */
+enum http_uri_parser_format {
+	URI_PARSER_FORMAT_EMPTY,
+	URI_PARSER_FORMAT_ASTERISK,
+	URI_PARSER_FORMAT_ABSPATH,
+	URI_PARSER_FORMAT_ABSURI_OR_AUTHORITY,
+};
+
+/* Parser context for an HTTP URI. Must be initialized with http_uri_parser_init
+ * before its usage.
+ *
+ * The parser API is not idempotent.
For an initialized parser instance, each + * URI element can be extracted only once using its related function : + * - http_parse_scheme + * - http_parse_authority + * - http_parse_path + * + * Also each element must be extracted in the order of its appearance in the + * URI according to the rfc 3986. However, it is possible to skip the parsing + * of elements which are of no interest. + * + * If the above rules are not respected, the parsing functions return an empty + * ist. + */ +struct http_uri_parser { + struct ist uri; /* HTTP URI for parsing */ + enum http_uri_parser_state state; /* already parsed HTTP URI elements */ + enum http_uri_parser_format format; /* rfc 7230 5.3 HTTP URI format */ +}; + +#endif /* _HAPROXY_HTTP_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/http.h b/include/haproxy/http.h new file mode 100644 index 0000000..2992640 --- /dev/null +++ b/include/haproxy/http.h @@ -0,0 +1,222 @@ +/* + * include/haproxy/http.h + * + * Functions for version-agnostic and implementation-agnostic HTTP protocol. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_HTTP_H +#define _HAPROXY_HTTP_H + +#include <string.h> +#include <import/ist.h> +#include <haproxy/api.h> +#include <haproxy/http-t.h> + +extern const int http_err_codes[HTTP_ERR_SIZE]; +extern const char *http_err_msgs[HTTP_ERR_SIZE]; +extern const struct ist http_known_methods[HTTP_METH_OTHER]; +extern const uint8_t http_char_classes[256]; + +enum http_meth_t find_http_meth(const char *str, const int len); +int http_get_status_idx(unsigned int status); +const char *http_get_reason(unsigned int status); +struct ist http_get_host_port(const struct ist host); +int http_is_default_port(const struct ist schm, const struct ist port); +int http_validate_scheme(const struct ist schm); +struct ist http_parse_scheme(struct http_uri_parser *parser); +struct ist http_parse_authority(struct http_uri_parser *parser, int no_userinfo); +struct ist http_parse_path(struct http_uri_parser *parser); +int http_parse_cont_len_header(struct ist *value, unsigned long long *body_len, + int not_first); +int http_header_match2(const char *hdr, const char *end, + const char *name, int len); +char *http_find_hdr_value_end(char *s, const char *e); +char *http_find_cookie_value_end(char *s, const char *e); +char *http_extract_cookie_value(char *hdr, const char *hdr_end, + char *cookie_name, size_t cookie_name_l, + int list, char **value, size_t *value_l); +char *http_extract_next_cookie_name(char *hdr_beg, char *hdr_end, int is_req, + char **ptr, size_t *len); +int http_parse_qvalue(const char *qvalue, const char **end); +const char *http_find_url_param_pos(const char **chunks, + const char* url_param_name, + size_t url_param_name_l, char delim, char 
insensitive); +int http_find_next_url_param(const char **chunks, + const char* url_param_name, size_t url_param_name_l, + const char **vstart, const char **vend, char delim, char insensitive); + +int http_parse_header(const struct ist hdr, struct ist *name, struct ist *value); +int http_parse_stline(const struct ist line, struct ist *p1, struct ist *p2, struct ist *p3); +int http_parse_status_val(const struct ist value, struct ist *status, struct ist *reason); + +int http_compare_etags(struct ist etag1, struct ist etag2); + +struct ist http_trim_leading_spht(struct ist value); +struct ist http_trim_trailing_spht(struct ist value); + +/* + * Given a path string and its length, find the position of beginning of the + * query string. Returns NULL if no query string is found in the path. + * + * Example: if path = "/foo/bar/fubar?yo=mama;ye=daddy", and n = 22: + * + * find_query_string(path, n, '?') points to "yo=mama;ye=daddy" string. + */ +static inline char *http_find_param_list(char *path, size_t path_l, char delim) +{ + char *p; + + p = memchr(path, delim, path_l); + return p ? p + 1 : NULL; +} + +static inline int http_is_param_delimiter(char c, char delim) +{ + return c == '&' || c == ';' || c == delim; +} + +/* Match language range with language tag. RFC2616 14.4: + * + * A language-range matches a language-tag if it exactly equals + * the tag, or if it exactly equals a prefix of the tag such + * that the first tag character following the prefix is "-". + * + * Return 1 if the strings match, else return 0. + */ +static inline int http_language_range_match(const char *range, int range_len, + const char *tag, int tag_len) +{ + const char *end = range + range_len; + const char *tend = tag + tag_len; + + while (range < end) { + if (*range == '-' && tag == tend) + return 1; + if (*range != *tag || tag == tend) + return 0; + range++; + tag++; + } + /* Return true only if the last char of the tag is matched. */ + return tag == tend; +} + +static inline enum http_etag_type http_get_etag_type(const struct ist etag) +{ + /* An ETag must be at least 2 characters. */ + if (etag.len < 2) + return ETAG_INVALID; + + /* The last character must be a `"`. */ + if (etag.ptr[etag.len - 1] != '"') + return ETAG_INVALID; + + /* If the ETag starts with a `"` then it is a strong ETag. */ + if (etag.ptr[0] == '"') + return ETAG_STRONG; + + /* If the ETag starts with `W/"` then it is a weak ETag. */ + if (istnmatch(etag, ist("W/\""), 3)) + return ETAG_WEAK; + + return ETAG_INVALID; +} + +/* Initialize a HTTP URI parser to use it with http URI parsing functions. The + * URI format is detected according to its first character. + */ +static inline struct http_uri_parser http_uri_parser_init(const struct ist uri) +{ + struct http_uri_parser parser = { + .uri = uri, + .state = URI_PARSER_STATE_BEFORE, + }; + + /* RFC7230, par. 2.7 : + * Request-URI = "*" | absuri | abspath | authority + */ + + if (!istlen(parser.uri)) { + parser.format = URI_PARSER_FORMAT_EMPTY; + } + else { + /* detect the format according to the first URI character */ + switch (*istptr(parser.uri)) { + case '*': + parser.format = URI_PARSER_FORMAT_ASTERISK; + break; + + case '/': + parser.format = URI_PARSER_FORMAT_ABSPATH; + break; + + default: + parser.format = URI_PARSER_FORMAT_ABSURI_OR_AUTHORITY; + break; + } + } + + return parser; +} + +/* Looks into <ist> for forbidden characters for header values (0x00, 0x0A, + * 0x0D), starting at pointer <start> which must be within <ist>. Returns + * non-zero if such a character is found, 0 otherwise. 
When run on unlikely + * header match, it's recommended to first check for the presence of control + * chars using ist_find_ctl(). + */ +static inline int http_header_has_forbidden_char(const struct ist ist, const char *start) +{ + do { + if ((uint8_t)*start <= 0x0d && + (1U << (uint8_t)*start) & ((1<<13) | (1<<10) | (1<<0))) + return 1; + start++; + } while (start < istend(ist)); + return 0; +} + +/* Looks into <ist> for forbidden characters for :path values (0x00..0x1F, + * 0x20, 0x23), starting at pointer <start> which must be within <ist>. + * Returns non-zero if such a character is found, 0 otherwise. When run on + * unlikely header match, it's recommended to first check for the presence + * of control chars using ist_find_ctl(). + */ +static inline int http_path_has_forbidden_char(const struct ist ist, const char *start) +{ + do { + if ((uint8_t)*start <= 0x23) { + if ((uint8_t)*start < 0x20) + return 1; + if ((1U << ((uint8_t)*start & 0x1F)) & ((1<<3) | (1<<0))) + return 1; + } + start++; + } while (start < istend(ist)); + return 0; +} + +#endif /* _HAPROXY_HTTP_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/http_ana-t.h b/include/haproxy/http_ana-t.h new file mode 100644 index 0000000..5b7342f --- /dev/null +++ b/include/haproxy/http_ana-t.h @@ -0,0 +1,264 @@ +/* + * include/haproxy/http_ana-t.h + * This file contains HTTP protocol definitions. + * + * Copyright (C) 2000-2011 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PROTO_HTTP_T_H +#define _HAPROXY_PROTO_HTTP_T_H + +#include <haproxy/api-t.h> +#include <haproxy/channel-t.h> +#include <haproxy/http-t.h> + +/* These are the flags that are found in txn->flags */ + +/* action flags. + * Please also update the txn_show_flags() function below in case of changes. 
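+ * (txn_show_flags() dumps the flag names themselves, so a tarpitted
+ * transaction would typically show "TX_CLTARPIT" in its output.)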
+ */ +/* Unused: 0x00000001..0x00000004 */ +#define TX_CONST_REPLY 0x00000008 /* The http reply must not be rewritten (don't eval after-response ruleset) */ +#define TX_CLTARPIT 0x00000010 /* the transaction is tarpitted (anti-dos) */ + +/* transaction flags dedicated to cookies : bits values 0x20 to 0x80 (0-7 shift 5) */ +#define TX_CK_NONE 0x00000000 /* this transaction had no cookie */ +#define TX_CK_INVALID 0x00000020 /* this transaction had a cookie which matches no server */ +#define TX_CK_DOWN 0x00000040 /* this transaction had cookie matching a down server */ +#define TX_CK_VALID 0x00000060 /* this transaction had cookie matching a valid server */ +#define TX_CK_EXPIRED 0x00000080 /* this transaction had an expired cookie (idle for too long) */ +#define TX_CK_OLD 0x000000A0 /* this transaction had too old a cookie (offered too long ago) */ +#define TX_CK_UNUSED 0x000000C0 /* this transaction had a cookie but it was not used (eg: use-server was preferred) */ +#define TX_CK_MASK 0x000000E0 /* mask to get this transaction's cookie flags */ +#define TX_CK_SHIFT 5 /* bit shift */ + +/* response cookie information, bits values 0x100 to 0x700 (0-7 shift 8) */ +#define TX_SCK_NONE 0x00000000 /* no cookie found in the response */ +#define TX_SCK_FOUND 0x00000100 /* a persistence cookie was found and forwarded */ +#define TX_SCK_DELETED 0x00000200 /* an existing persistence cookie was deleted */ +#define TX_SCK_INSERTED 0x00000300 /* a persistence cookie was inserted */ +#define TX_SCK_REPLACED 0x00000400 /* a persistence cookie was present and rewritten */ +#define TX_SCK_UPDATED 0x00000500 /* an expirable persistence cookie was updated */ +#define TX_SCK_MASK 0x00000700 /* mask to get the set-cookie field */ +#define TX_SCK_SHIFT 8 /* bit shift */ + +#define TX_SCK_PRESENT 0x00000800 /* a cookie was found in the server's response */ + +/* cacheability management, bits values 0x1000 to 0x3000 (0-3 shift 12) */ +#define TX_CACHEABLE 0x00001000 /* at least part of the response is cacheable */ +#define TX_CACHE_COOK 0x00002000 /* a cookie in the response is cacheable */ +#define TX_CACHE_IGNORE 0x00004000 /* do not retrieve object from cache */ +#define TX_CACHE_SHIFT 12 /* bit shift */ + +#define TX_CON_WANT_TUN 0x00008000 /* Will be a tunnel (CONNECT or 101-Switching-Protocol) */ + +#define TX_CACHE_HAS_SEC_KEY 0x00010000 /* secondary key building succeeded */ + +#define TX_USE_PX_CONN 0x00020000 /* Use "Proxy-Connection" instead of "Connection" */ + +/* used only for keep-alive purposes, to indicate we're on a second transaction */ +#define TX_NOT_FIRST 0x00040000 /* the transaction is not the first one */ + +#define TX_L7_RETRY 0x000800000 /* The transaction may attempt L7 retries */ +#define TX_D_L7_RETRY 0x001000000 /* Disable L7 retries on this transaction, even if configured to do it */ + +/* This function is used to report flags in debugging tools. Please reflect + * below any single-bit flag addition above in the same order via the + * __APPEND_FLAG and __APPEND_ENUM macros. The new end of the buffer is + * returned. + */ +static forceinline char *txn_show_flags(char *buf, size_t len, const char *delim, uint flg) +{ +#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__) +#define _e(m, e, ...) 
__APPEND_ENUM(buf, len, delim, flg, m, e, #e, __VA_ARGS__) + /* prologue */ + _(0); + /* flags & enums */ + _(TX_SCK_PRESENT, _(TX_CACHEABLE, _(TX_CACHE_COOK, _(TX_CACHE_IGNORE, + _(TX_CON_WANT_TUN, _(TX_CACHE_HAS_SEC_KEY, _(TX_USE_PX_CONN, + _(TX_NOT_FIRST, _(TX_L7_RETRY, _(TX_D_L7_RETRY)))))))))); + + _e(TX_SCK_MASK, TX_SCK_FOUND, _e(TX_SCK_MASK, TX_SCK_DELETED, + _e(TX_SCK_MASK, TX_SCK_INSERTED, _e(TX_SCK_MASK, TX_SCK_REPLACED, + _e(TX_SCK_MASK, TX_SCK_UPDATED))))); + + _e(TX_CK_MASK, TX_CK_INVALID, _e(TX_CK_MASK, TX_CK_DOWN, + _e(TX_CK_MASK, TX_CK_VALID, _e(TX_CK_MASK, TX_CK_EXPIRED, + _e(TX_CK_MASK, TX_CK_OLD, _e(TX_CK_MASK, TX_CK_UNUSED)))))); + + _(TX_CONST_REPLY, _(TX_CLTARPIT)); + /* epilogue */ + _(~0U); + return buf; +#undef _e +#undef _ +} + + +/* + * HTTP message status flags (msg->flags). + * Please also update the txn_show_flags() function below in case of changes. + */ +#define HTTP_MSGF_CNT_LEN 0x00000001 /* content-length was found in the message */ +#define HTTP_MSGF_TE_CHNK 0x00000002 /* transfer-encoding: chunked was found */ + +/* if this flags is not set in either direction, we may be forced to complete a + * connection as a half-way tunnel (eg if no content-length appears in a 1.1 + * response, but the request is correctly sized) + */ +#define HTTP_MSGF_XFER_LEN 0x00000004 /* message xfer size can be determined */ +#define HTTP_MSGF_VER_11 0x00000008 /* the message is HTTP/1.1 or above */ + +#define HTTP_MSGF_SOFT_RW 0x00000010 /* soft header rewrites, no error triggered */ + +#define HTTP_MSGF_COMPRESSING 0x00000020 /* data compression is in progress */ + +#define HTTP_MSGF_BODYLESS 0x00000040 /* The message has no body (content-length = 0) */ +#define HTTP_MSGF_CONN_UPG 0x00000080 /* The message contains "Connection: Upgrade" header */ + +#define HTTP_MSGF_EXPECT_CHECKED 0x00000100 /* Expect header was already handled, if any */ + +/* This function is used to report flags in debugging tools. Please reflect + * below any single-bit flag addition above in the same order via the + * __APPEND_FLAG macro. The new end of the buffer is returned. + */ +static forceinline char *hmsg_show_flags(char *buf, size_t len, const char *delim, uint flg) +{ +#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__) + /* prologue */ + _(0); + /* flags */ + _(HTTP_MSGF_CNT_LEN, _(HTTP_MSGF_TE_CHNK, _(HTTP_MSGF_XFER_LEN, + _(HTTP_MSGF_VER_11, _(HTTP_MSGF_SOFT_RW, _(HTTP_MSGF_COMPRESSING, + _(HTTP_MSGF_BODYLESS, _(HTTP_MSGF_CONN_UPG, _(HTTP_MSGF_EXPECT_CHECKED))))))))); + /* epilogue */ + _(~0U); + return buf; +#undef _ +} + + +/* Maximum length of the cache secondary key (sum of all the possible parts of + * the secondary key). The actual keys might be smaller for some + * request/response pairs, because they depend on the responses' optional Vary + * header. 
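+ * With the parts currently defined, this maximum adds up to
+ * 4 + 8 + 8 = 20 bytes.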
The different sizes can be found in the vary_information object (see + * cache.c).*/ +#define HTTP_CACHE_SEC_KEY_LEN (sizeof(uint32_t)+sizeof(uint64_t)+sizeof(uint64_t)) + + +/* Redirect flags */ +enum { + REDIRECT_FLAG_NONE = 0, + REDIRECT_FLAG_DROP_QS = 1, /* drop query string */ + REDIRECT_FLAG_APPEND_SLASH = 2, /* append a slash if missing at the end */ + REDIRECT_FLAG_FROM_REQ = 4, /* redirect rule on the request path */ + REDIRECT_FLAG_IGNORE_EMPTY = 8, /* silently ignore empty location expressions */ +}; + +/* Redirect types (location, prefix, extended ) */ +enum { + REDIRECT_TYPE_NONE = 0, /* no redirection */ + REDIRECT_TYPE_LOCATION, /* location redirect */ + REDIRECT_TYPE_PREFIX, /* prefix redirect */ + REDIRECT_TYPE_SCHEME, /* scheme redirect (eg: switch from http to https) */ +}; + +/* Persist types (force-persist, ignore-persist) */ +enum { + PERSIST_TYPE_NONE = 0, /* no persistence */ + PERSIST_TYPE_FORCE, /* force-persist */ + PERSIST_TYPE_IGNORE, /* ignore-persist */ +}; + +/* final results for http-request rules */ +enum rule_result { + HTTP_RULE_RES_CONT = 0, /* nothing special, continue rules evaluation */ + HTTP_RULE_RES_YIELD, /* call me later because some data is missing. */ + HTTP_RULE_RES_STOP, /* stopped processing on an accept */ + HTTP_RULE_RES_DENY, /* deny (or tarpit if TX_CLTARPIT) */ + HTTP_RULE_RES_ABRT, /* abort request, msg already sent (eg: auth) */ + HTTP_RULE_RES_DONE, /* processing done, stop processing (eg: redirect) */ + HTTP_RULE_RES_BADREQ, /* bad request */ + HTTP_RULE_RES_ERROR, /* Internal error */ +}; + +/* Legacy version of the HTTP/1 message state, used by the channels, should + * ultimately be removed. + */ +enum h1_state { + HTTP_MSG_RQBEFORE = 0, // request: leading LF, before start line + HTTP_MSG_RPBEFORE = 1, // response: leading LF, before start line + + /* Body processing. + * The state HTTP_MSG_BODY is a delimiter to know if we're waiting for headers + * or body. All the sub-states below also indicate we're processing the body, + * with some additional information. + */ + HTTP_MSG_BODY = 2, // parsing body at end of headers + HTTP_MSG_DATA = 3, // skipping data chunk / content-length data + /* we enter this state when we've received the end of the current message */ + HTTP_MSG_ENDING = 4, // message end received, wait that the filters end too + HTTP_MSG_DONE = 5, // message end received, waiting for resync or close + HTTP_MSG_CLOSING = 6, // shutdown_w done, not all bytes sent yet + HTTP_MSG_CLOSED = 7, // shutdown_w done, all bytes sent + HTTP_MSG_TUNNEL = 8, // tunneled data after DONE +} __attribute__((packed)); + + +/* This is the state of an HTTP seen from the analyzers point of view. It can be + * either a request message or a response message. + */ +struct http_msg { + enum h1_state msg_state; /* where we are in the current message parsing */ + /* 3 bytes unused here */ + unsigned int flags; /* flags describing the message (HTTP version, ...) */ + struct channel *chn; /* pointer to the channel transporting the message */ +}; + + +/* This is an HTTP transaction. It contains both a request message and a + * response message (which can be empty). 
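+ * The transaction is allocated by http_create_txn() and released by
+ * http_destroy_txn() (see http_ana.h).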
+ */ +struct http_txn { + struct http_msg rsp; /* HTTP response message */ + struct http_msg req; /* HTTP request message */ + unsigned int flags; /* transaction flags */ + enum http_meth_t meth; /* HTTP method */ + /* 1 unused byte here */ + short status; /* HTTP status sent to the client, negative if not set */ + short server_status; /* HTTP status received from the server, negative if not received */ + struct http_reply *http_reply; /* The HTTP reply to use as reply */ + struct buffer l7_buffer; /* To store the data, in case we have to retry */ + char cache_hash[20]; /* Store the cache hash */ + char cache_secondary_hash[HTTP_CACHE_SEC_KEY_LEN]; /* Optional cache secondary key. */ + char *uri; /* first line if log needed, NULL otherwise */ + char *cli_cookie; /* cookie presented by the client, in capture mode */ + char *srv_cookie; /* cookie presented by the server, in capture mode */ + int cookie_first_date; /* if non-zero, first date the expirable cookie was set/seen */ + int cookie_last_date; /* if non-zero, last date the expirable cookie was set/seen */ + + struct http_auth_data auth; /* HTTP auth data */ +}; + +#endif /* _HAPROXY_PROTO_HTTP_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/http_ana.h b/include/haproxy/http_ana.h new file mode 100644 index 0000000..2cc6516 --- /dev/null +++ b/include/haproxy/http_ana.h @@ -0,0 +1,91 @@ +/* + * include/haproxy/http_ana.h + * This file contains HTTP protocol definitions. + * + * Copyright (C) 2000-2011 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PROTO_HTTP_H +#define _HAPROXY_PROTO_HTTP_H + +#include <haproxy/api.h> +#include <haproxy/channel-t.h> +#include <haproxy/http_ana-t.h> +#include <haproxy/htx-t.h> +#include <haproxy/proxy-t.h> +#include <haproxy/stream-t.h> + +extern struct pool_head *pool_head_uniqueid; +extern struct pool_head *pool_head_http_txn; + +int http_wait_for_request(struct stream *s, struct channel *req, int an_bit); +int http_process_req_common(struct stream *s, struct channel *req, int an_bit, struct proxy *px); +int http_process_request(struct stream *s, struct channel *req, int an_bit); +int http_process_tarpit(struct stream *s, struct channel *req, int an_bit); +int http_wait_for_request_body(struct stream *s, struct channel *req, int an_bit); +int http_wait_for_response(struct stream *s, struct channel *rep, int an_bit); +int http_process_res_common(struct stream *s, struct channel *rep, int an_bit, struct proxy *px); +int http_request_forward_body(struct stream *s, struct channel *req, int an_bit); +int http_response_forward_body(struct stream *s, struct channel *res, int an_bit); +int http_apply_redirect_rule(struct redirect_rule *rule, struct stream *s, struct http_txn *txn); +int http_eval_after_res_rules(struct stream *s); +int http_replace_hdrs(struct stream* s, struct htx *htx, struct ist name, const char *str, struct my_regex *re, int full); +int http_req_replace_stline(int action, const char *replace, int len, + struct proxy *px, struct stream *s); +int http_res_set_status(unsigned int status, struct ist reason, struct stream *s); +void http_check_request_for_cacheability(struct stream *s, struct channel *req); +void http_check_response_for_cacheability(struct stream *s, struct channel *res); +enum rule_result http_wait_for_msg_body(struct stream *s, struct channel *chn, unsigned int time, unsigned int bytes); +void http_perform_server_redirect(struct stream *s, struct stconn *sc); +void http_server_error(struct stream *s, struct stconn *sc, int err, int finst, struct http_reply *msg); +void http_reply_and_close(struct stream *s, short status, struct http_reply *msg); +void http_return_srv_error(struct stream *s, struct stconn *sc); +struct http_reply *http_error_message(struct stream *s); +int http_reply_to_htx(struct stream *s, struct htx *htx, struct http_reply *reply); +int http_reply_message(struct stream *s, struct http_reply *reply); +int http_forward_proxy_resp(struct stream *s, int final); + +struct http_txn *http_create_txn(struct stream *s); +void http_destroy_txn(struct stream *s); + +void http_set_term_flags(struct stream *s); + +/* for debugging, reports the HTTP/1 message state name (legacy version) */ +static inline const char *h1_msg_state_str(enum h1_state msg_state) +{ + switch (msg_state) { + case HTTP_MSG_RQBEFORE: return "MSG_RQBEFORE"; + case HTTP_MSG_RPBEFORE: return "MSG_RPBEFORE"; + case HTTP_MSG_BODY: return "MSG_BODY"; + case HTTP_MSG_DATA: return "MSG_DATA"; + case HTTP_MSG_ENDING: return "MSG_ENDING"; + case HTTP_MSG_DONE: return "MSG_DONE"; + case HTTP_MSG_CLOSING: return "MSG_CLOSING"; + case HTTP_MSG_CLOSED: return "MSG_CLOSED"; + case HTTP_MSG_TUNNEL: return "MSG_TUNNEL"; + default: return "MSG_??????"; + } +} + +#endif /* _HAPROXY_PROTO_HTTP_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 
diff --git a/include/haproxy/http_client-t.h b/include/haproxy/http_client-t.h
new file mode 100644
index 0000000..7ae0e61
--- /dev/null
+++ b/include/haproxy/http_client-t.h
@@ -0,0 +1,69 @@
+#ifndef _HAPROXY_HTTPCLIENT_T_H
+#define _HAPROXY_HTTPCLIENT_T_H
+
+#include <haproxy/http-t.h>
+
+struct httpclient {
+	struct {
+		struct ist url;        /* URL of the request */
+		enum http_meth_t meth; /* method of the request */
+		struct buffer buf;     /* output buffer, HTX */
+	} req;
+	struct {
+		struct ist vsn;
+		uint16_t status;
+		struct ist reason;
+		struct http_hdr *hdrs; /* headers */
+		struct buffer buf;     /* input buffer, raw HTTP */
+	} res;
+	struct {
+		/* callbacks used to send the request */
+		void (*req_payload)(struct httpclient *hc);  /* send a payload */
+
+		/* callbacks used to receive the response; if not set, the IO
+		 * handler will consume the data without doing anything */
+		void (*res_stline)(struct httpclient *hc);   /* start line received */
+		void (*res_headers)(struct httpclient *hc);  /* headers received */
+		void (*res_payload)(struct httpclient *hc);  /* payload received */
+		void (*res_end)(struct httpclient *hc);      /* end of the response */
+	} ops;
+	struct sockaddr_storage *dst; /* destination address */
+	struct appctx *appctx;        /* httpclient appctx */
+	int timeout_server;           /* server timeout in ms */
+	void *caller;                 /* ptr to the caller */
+	unsigned int flags;           /* other flags */
+	struct proxy *px;             /* proxy for special cases */
+	struct server *srv_raw;       /* server for clear connections */
+#ifdef USE_OPENSSL
+	struct server *srv_ssl;       /* server for SSL connections */
+#endif
+};
+
+/* Action (FA) to do */
+#define HTTPCLIENT_FA_STOP     0x00000001 /* stops the httpclient at the next IO handler call */
+#define HTTPCLIENT_FA_AUTOKILL 0x00000002 /* sets the applet to destroy the httpclient struct itself */
+
+/* status (FS) */
+#define HTTPCLIENT_FS_STARTED  0x00010000 /* the httpclient was started */
+#define HTTPCLIENT_FS_ENDED    0x00020000 /* the httpclient is stopped */
+
+/* States of the HTTP Client Appctx */
+enum {
+	HTTPCLIENT_S_REQ = 0,
+	HTTPCLIENT_S_REQ_BODY,
+	HTTPCLIENT_S_RES_STLINE,
+	HTTPCLIENT_S_RES_HDR,
+	HTTPCLIENT_S_RES_BODY,
+	HTTPCLIENT_S_RES_END,
+};
+
+#define HTTPCLIENT_USERAGENT "HAProxy"
+
+/* What kind of data we need to read */
+#define HC_F_RES_STLINE 0x01
+#define HC_F_RES_HDR    0x02
+#define HC_F_RES_BODY   0x04
+#define HC_F_RES_END    0x08
+
+#endif /* !_HAPROXY_HTTPCLIENT_T_H */
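The types above are the whole public surface of the HTTP client: a request description, a response area filled by the applet, and optional callbacks fired as response parts arrive. A minimal sketch of how a caller might drive it, based on the declarations in http_client.h below; the zero-on-success convention of httpclient_req_gen() and the cleanup path are assumptions here, not guarantees:

    #include <stdio.h>
    #include <import/ist.h>
    #include <haproxy/http_client.h>

    /* Hypothetical completion hook: report how many response bytes arrived. */
    static void my_res_end(struct httpclient *hc)
    {
            printf("response ended, %d bytes buffered\n", httpclient_data(hc));
    }

    /* Issue a GET and let the applet drive the transfer. */
    static struct appctx *start_get(void)
    {
            struct httpclient *hc;

            hc = httpclient_new(NULL, HTTP_METH_GET, ist("http://127.0.0.1:8000/"));
            if (!hc)
                    return NULL;

            hc->ops.res_end = my_res_end; /* called once HTTPCLIENT_FS_ENDED is set */

            /* build the HTX request in hc->req.buf: no extra headers, no payload;
             * assumed convention: 0 means success */
            if (httpclient_req_gen(hc, hc->req.url, hc->req.meth, NULL, IST_NULL) != 0 ||
                !httpclient_start(hc)) {
                    httpclient_destroy(hc);
                    return NULL;
            }
            return hc->appctx;
    }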
diff --git a/include/haproxy/http_client.h b/include/haproxy/http_client.h
new file mode 100644
index 0000000..241ca24
--- /dev/null
+++ b/include/haproxy/http_client.h
@@ -0,0 +1,40 @@
+#ifndef _HAPROXY_HTTPCLIENT_H
+#define _HAPROXY_HTTPCLIENT_H
+
+#include <haproxy/http_client-t.h>
+
+void httpclient_destroy(struct httpclient *hc);
+void httpclient_stop_and_destroy(struct httpclient *hc);
+
+struct proxy *httpclient_create_proxy(const char *id);
+struct httpclient *httpclient_new(void *caller, enum http_meth_t meth, struct ist url);
+struct httpclient *httpclient_new_from_proxy(struct proxy *px, void *caller, enum http_meth_t meth, struct ist url);
+int httpclient_set_proxy(struct httpclient *hc, struct proxy *px);
+
+struct appctx *httpclient_start(struct httpclient *hc);
+int httpclient_set_dst(struct httpclient *hc, const char *dst);
+void httpclient_set_timeout(struct httpclient *hc, int timeout);
+int httpclient_res_xfer(struct httpclient *hc, struct buffer *dst);
+int httpclient_req_gen(struct httpclient *hc, const struct ist url, enum http_meth_t meth, const struct http_hdr *hdrs, const struct ist payload);
+int httpclient_req_xfer(struct httpclient *hc, struct ist src, int end);
+
+/* Returns the amount of data available in the httpclient response buffer */
+static inline int httpclient_data(struct httpclient *hc)
+{
+	return b_data(&hc->res.buf);
+}
+
+/* Returns 1 if the httpclient ended and won't receive any new data */
+static inline int httpclient_ended(struct httpclient *hc)
+{
+	return !!(hc->flags & HTTPCLIENT_FS_ENDED);
+}
+
+/* Returns 1 if the httpclient started */
+static inline int httpclient_started(struct httpclient *hc)
+{
+	return !!(hc->flags & HTTPCLIENT_FS_STARTED);
+}
+
+#endif /* !_HAPROXY_HTTPCLIENT_H */
diff --git a/include/haproxy/http_ext-t.h b/include/haproxy/http_ext-t.h
new file mode 100644
index 0000000..68eb047
--- /dev/null
+++ b/include/haproxy/http_ext-t.h
@@ -0,0 +1,149 @@
+/*
+ * include/haproxy/http_ext-t.h
+ * Version-agnostic and implementation-agnostic HTTP extensions definitions
+ *
+ * Copyright 2022 HAProxy Technologies
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_HTTPEXT_T_H
+#define _HAPROXY_HTTPEXT_T_H
+
+#include <arpa/inet.h>
+#include <import/ist.h>
+#include <haproxy/tools-t.h>
+
+enum forwarded_header_attribute_type {
+	FORWARDED_HEADER_UNK = 0,
+	FORWARDED_HEADER_OBFS = 1,
+	FORWARDED_HEADER_PORT = 2,
+	FORWARDED_HEADER_IP = 3,
+};
+
+struct forwarded_header_nodename {
+	union {
+		struct sockaddr_storage ip;
+		struct ist obfs;
+	};
+	enum forwarded_header_attribute_type type;
+};
+
+struct forwarded_header_nodeport {
+	union {
+		uint16_t port;
+		struct ist obfs;
+	};
+	enum forwarded_header_attribute_type type;
+};
+
+struct forwarded_header_node {
+	struct forwarded_header_nodename nodename;
+	struct forwarded_header_nodeport nodeport;
+	struct ist raw;
+};
+
+enum forwarded_header_proto {
+	FORWARDED_HEADER_HTTP = 1,
+	FORWARDED_HEADER_HTTPS = 2
+};
+
+struct forwarded_header_ctx {
+	struct forwarded_header_node nfor;
+	struct forwarded_header_node nby;
+	struct ist host;
+	enum forwarded_header_proto proto;
+};
+
+enum http_ext_7239_forby_mode {
+	HTTP_7239_FORBY_ORIG = 1,
+	HTTP_7239_FORBY_SMP = 2
+};
+struct http_ext_7239_forby {
+	/* nn = nodename, np = nodeport */
+	union {
+		char *nn_expr_s;
+		struct sample_expr *nn_expr;
+	};
+	union {
+		char *np_expr_s;
+		struct sample_expr *np_expr;
+	};
+	enum http_ext_7239_forby_mode nn_mode;
+	enum http_ext_7239_forby_mode np_mode;
+};
+
+enum http_ext_7239_host_mode {
+	HTTP_7239_HOST_ORIG = 1,
+	HTTP_7239_HOST_SMP = 2
+};
+struct http_ext_7239_host {
+	union {
+		char *expr_s;
+		struct sample_expr *expr;
+	};
+	enum http_ext_7239_host_mode mode;
+};
+
+struct http_ext_7239 {
+	/* forwarded header parameters options */
+	struct http_ext_7239_forby p_for;
+	struct http_ext_7239_forby p_by;
+	struct http_ext_7239_host p_host;
+	uint8_t p_proto;
+	/* config error hints, used only during configuration parsing */
+	char *c_file;
+	int c_line;
+	int c_mode; /* 0: parsed, 1: compiled */
+};
+
+enum forwarded_header_field {
+	FORWARDED_HEADER_FOR =   0x01,
+	FORWARDED_HEADER_BY =    0x02,
+	FORWARDED_HEADER_HOST =  0x04,
+	FORWARDED_HEADER_PROTO = 0x08,
+	FORWARDED_HEADER_ALL = FORWARDED_HEADER_FOR|FORWARDED_HEADER_BY|FORWARDED_HEADER_HOST|FORWARDED_HEADER_PROTO
+};
+
+enum http_ext_xff_mode {
+	HTTP_XFF_IFNONE = 0, /* set if not already set */
+	HTTP_XFF_ALWAYS = 1  /* always set x-forwarded-for */
+};
+struct http_ext_xff {
+	struct ist hdr_name;        /* header to use - default: "x-forwarded-for" */
+	struct net_addr except_net; /* don't forward x-forwarded-for for this address. */
+	uint8_t mode;
+};
+
+struct http_ext_xot {
+	struct ist hdr_name;        /* header to use - default: "x-original-to" */
+	struct net_addr except_net; /* don't forward x-original-to for this address. */
+};
+
+/* http_ext options */
+struct http_ext {
+	/* forwarded header (RFC 7239) */
+	struct http_ext_7239 *fwd;
+	/* x-forwarded-for:
+	 * conditionally insert x-forwarded-for with client address
+	 */
+	struct http_ext_xff *xff;
+	/* x-original-to:
+	 * insert x-original-to with destination address
+	 */
+	struct http_ext_xot *xot;
+};
+
+#endif /* !_HAPROXY_HTTPEXT_T_H */
diff --git a/include/haproxy/http_ext.h b/include/haproxy/http_ext.h
new file mode 100644
index 0000000..53764a2
--- /dev/null
+++ b/include/haproxy/http_ext.h
@@ -0,0 +1,58 @@
+/*
+ * include/haproxy/http_ext.h
+ * Functions for Version-agnostic and implementation-agnostic HTTP extensions
+ *
+ * Copyright 2022 HAProxy Technologies
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_HTTPEXT_H
+#define _HAPROXY_HTTPEXT_H
+
+#include <haproxy/http_ext-t.h>
+#include <haproxy/channel-t.h>
+#include <haproxy/proxy-t.h>
+#include <haproxy/stream-t.h>
+
+int http_validate_7239_header(struct ist hdr, int required_steps, struct forwarded_header_ctx *ctx);
+
+int http_handle_7239_header(struct stream *s, struct channel *req);
+int http_handle_xff_header(struct stream *s, struct channel *req);
+int http_handle_xot_header(struct stream *s, struct channel *req);
+
+int proxy_http_parse_7239(char **args, int cur_arg, struct proxy *curproxy, const struct proxy *defpx, const char *file, int linenum);
+int proxy_http_compile_7239(struct proxy *curproxy);
+int proxy_http_parse_xff(char **args, int cur_arg, struct proxy *curproxy, const struct proxy *defpx, const char *file, int linenum);
+int proxy_http_parse_xot(char **args, int cur_arg, struct proxy *curproxy, const struct proxy *defpx, const char *file, int linenum);
+
+int http_ext_7239_prepare(struct proxy *cur);
+int http_ext_xff_prepare(struct proxy *cur);
+int http_ext_xot_prepare(struct proxy *cur);
+
+void http_ext_7239_dup(const struct proxy *def, struct proxy *cpy);
+void http_ext_xff_dup(const struct proxy *def, struct proxy *cpy);
+void http_ext_xot_dup(const struct proxy *def, struct proxy *cpy);
+
+void http_ext_7239_clean(struct proxy *cur);
+void http_ext_xff_clean(struct proxy *cur);
+void http_ext_xot_clean(struct proxy *cur);
+
+int http_ext_prepare(struct proxy *cur);
+void http_ext_dup(const struct proxy *def, struct proxy *cpy);
+void http_ext_clean(struct proxy *cur);
+void http_ext_softclean(struct proxy *cur);
+
+#endif /* !_HAPROXY_HTTPEXT_H */
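As an illustration, a caller interested only in the "for" and "proto" elements of an RFC 7239 Forwarded header could combine the field masks from http_ext-t.h and inspect the filled context. A hedged sketch; the exact return convention of http_validate_7239_header() is an assumption here:

    #include <haproxy/http_ext.h>

    /* Check a raw Forwarded header value and extract the "for" and "proto"
     * elements. The required_steps mask tells the validator which elements
     * matter; forwarded_header_ctx receives the decoded result.
     */
    static int check_forwarded(void)
    {
            struct forwarded_header_ctx ctx;
            struct ist hdr = ist("for=192.0.2.43;proto=https");

            /* assumed convention: a negative value reports a malformed header */
            if (http_validate_7239_header(hdr, FORWARDED_HEADER_FOR | FORWARDED_HEADER_PROTO, &ctx) < 0)
                    return 0;

            return ctx.proto == FORWARDED_HEADER_HTTPS &&
                   ctx.nfor.nodename.type == FORWARDED_HEADER_IP;
    }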
diff --git a/include/haproxy/http_fetch.h b/include/haproxy/http_fetch.h
new file mode 100644
index 0000000..7997629
--- /dev/null
+++ b/include/haproxy/http_fetch.h
@@ -0,0 +1,41 @@
+/*
+ * include/haproxy/http_fetch.h
+ * This file contains the minimally required http sample fetch declarations.
+ *
+ * Copyright (C) 2000-2018 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_HTTP_FETCH_H
+#define _HAPROXY_HTTP_FETCH_H
+
+#include <haproxy/api.h>
+#include <haproxy/arg-t.h>
+#include <haproxy/channel-t.h>
+#include <haproxy/check-t.h>
+#include <haproxy/sample-t.h>
+
+struct htx *smp_prefetch_htx(struct sample *smp, struct channel *chn, struct check *check, int vol);
+int val_hdr(struct arg *arg, char **err_msg);
+
+#endif /* _HAPROXY_HTTP_FETCH_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/http_htx-t.h b/include/haproxy/http_htx-t.h
new file mode 100644
index 0000000..8051925
--- /dev/null
+++ b/include/haproxy/http_htx-t.h
@@ -0,0 +1,95 @@
+/*
+ * include/haproxy/http_htx-t.h
+ * This file defines everything related to HTTP manipulation using the internal
+ * representation.
+ *
+ * Copyright (C) 2018 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_HTTP_HTX_T_H
+#define _HAPROXY_HTTP_HTX_T_H
+
+#include <import/ebistree.h>
+#include <import/ist.h>
+
+#include <haproxy/buf-t.h>
+#include <haproxy/http-t.h>
+#include <haproxy/htx-t.h>
+
+/* Context used to find/remove an HTTP header. */
+struct http_hdr_ctx {
+	struct htx_blk *blk;
+	struct ist value;
+	uint16_t lws_before;
+	uint16_t lws_after;
+};
+
+
+/* Structure used to build the header list of an HTTP reply */
+struct http_reply_hdr {
+	struct ist name;   /* the header name */
+	struct list value; /* the log-format string value */
+	struct list list;  /* header chained list */
+};
+
+#define HTTP_REPLY_EMPTY    0x00 /* the reply has no payload */
+#define HTTP_REPLY_ERRMSG   0x01 /* the reply is an error message (may be NULL) */
+#define HTTP_REPLY_ERRFILES 0x02 /* the reply references an http-errors section */
+#define HTTP_REPLY_RAW      0x03 /* the reply uses a raw payload */
+#define HTTP_REPLY_LOGFMT   0x04 /* the reply uses a log-format payload */
+#define HTTP_REPLY_INDIRECT 0x05 /* the reply references another http-reply (may be NULL) */
+
+/* Used by HAProxy to generate internal responses */
+struct http_reply {
+	unsigned char type; /* HTTP_REPLY_* */
+	int status;         /* The response status code */
+	char *ctype;        /* The response content-type, may be NULL */
+	struct list hdrs;   /* A list of http_reply_hdr */
+	union {
+		struct list fmt;          /* A log-format string (type = HTTP_REPLY_LOGFMT) */
+		struct buffer obj;        /* A raw string (type = HTTP_REPLY_RAW) */
+		struct buffer *errmsg;    /* The error message to use as response (type = HTTP_REPLY_ERRMSG).
+		                           * May be NULL; if so, rely on the proxy error messages */
+		struct http_reply *reply; /* The HTTP reply to use as response (type = HTTP_REPLY_INDIRECT) */
+		char *http_errors;        /* The http-errors section to use (type = HTTP_REPLY_ERRFILES).
+		                           * Should be resolved during post-check */
+	} body;
+	struct list list; /* next http_reply in the global list.
+	                   * Only used for replies defined in a proxy section */
+};
+
+/* A custom HTTP error message loaded from a raw file and converted to HTX. The
+ * node key is the file path.
+ */
+struct http_error_msg {
+	struct buffer msg;
+	struct ebpt_node node;
+};
+
+/* http-errors section and parameters. */
+struct http_errors {
+	char *id; /* unique identifier */
+	struct {
+		char *file; /* file where the section appears */
+		int line;   /* line where the section appears */
+	} conf;     /* config information */
+
+	struct http_reply *replies[HTTP_ERR_SIZE]; /* HTTP replies for known errors */
+	struct list list; /* http-errors list */
+};
+
+#endif /* _HAPROXY_HTTP_HTX_T_H */
diff --git a/include/haproxy/http_htx.h b/include/haproxy/http_htx.h
new file mode 100644
index 0000000..3d01a06
--- /dev/null
+++ b/include/haproxy/http_htx.h
@@ -0,0 +1,84 @@
+/*
+ * include/haproxy/http_htx.h
+ * This file defines function prototypes for HTTP manipulation using the
+ * internal representation.
+ *
+ * Copyright (C) 2018 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_HTTP_HTX_H
+#define _HAPROXY_HTTP_HTX_H
+
+#include <import/ist.h>
+#include <haproxy/buf-t.h>
+#include <haproxy/http-hdr-t.h>
+#include <haproxy/http_htx-t.h>
+#include <haproxy/proxy-t.h>
+#include <haproxy/regex-t.h>
+
+extern struct buffer http_err_chunks[HTTP_ERR_SIZE];
+extern struct http_reply http_err_replies[HTTP_ERR_SIZE];
+extern struct list http_errors_list;
+
+struct htx_sl *http_get_stline(const struct htx *htx);
+size_t http_get_hdrs_size(struct htx *htx);
+int http_find_header(const struct htx *htx, const struct ist name, struct http_hdr_ctx *ctx, int full);
+int http_find_str_header(const struct htx *htx, const struct ist name, struct http_hdr_ctx *ctx, int full);
+int http_find_pfx_header(const struct htx *htx, const struct ist prefix, struct http_hdr_ctx *ctx, int full);
+int http_find_sfx_header(const struct htx *htx, const struct ist suffix, struct http_hdr_ctx *ctx, int full);
+int http_find_sub_header(const struct htx *htx, const struct ist sub, struct http_hdr_ctx *ctx, int full);
+int http_match_header(const struct htx *htx, const struct my_regex *re, struct http_hdr_ctx *ctx, int full);
+int http_add_header(struct htx *htx, const struct ist n, const struct ist v);
+int http_replace_stline(struct htx *htx, const struct ist p1, const struct ist p2, const struct ist p3);
+int http_replace_req_meth(struct htx *htx, const struct ist meth);
+int http_replace_req_uri(struct htx *htx, const struct ist uri);
+int http_replace_req_path(struct htx *htx, const struct ist path, int with_qs);
+int http_replace_req_query(struct htx *htx, const struct ist query);
+int http_replace_res_status(struct htx *htx, const struct ist status, const struct ist reason);
+int http_replace_res_reason(struct htx *htx, const struct ist reason);
+int http_append_header_value(struct htx *htx, struct http_hdr_ctx *ctx, const struct ist data);
+int http_prepend_header_value(struct htx *htx, struct http_hdr_ctx *ctx, const struct ist data);
+int http_replace_header_value(struct htx *htx, struct http_hdr_ctx *ctx, const struct ist data);
+int http_replace_header(struct htx *htx, struct http_hdr_ctx *ctx, const struct ist name, const struct ist value);
+int http_remove_header(struct htx *htx, struct http_hdr_ctx *ctx);
+int http_update_authority(struct htx *htx, struct htx_sl *sl, const struct ist host);
+int http_update_host(struct htx *htx, struct htx_sl *sl, const struct ist uri);
+
+unsigned int http_get_htx_hdr(const struct htx *htx, const struct ist hdr,
+                              int occ, struct http_hdr_ctx *ctx, char **vptr, size_t *vlen);
+unsigned int http_get_htx_fhdr(const struct htx *htx, const struct ist hdr,
+                               int occ, struct http_hdr_ctx *ctx, char **vptr, size_t *vlen);
+int http_str_to_htx(struct buffer *buf, struct ist raw, char **errmsg);
+
+void release_http_reply(struct http_reply *http_reply);
+int http_check_http_reply(struct http_reply *reply, struct proxy *px, char **errmsg);
+struct http_reply *http_parse_http_reply(const char **args, int *orig_arg, struct proxy *px,
+                                         int default_status, char **errmsg);
+
+int http_scheme_based_normalize(struct htx *htx);
+
+void http_cookie_register(struct http_hdr *list, int idx, int *first, int *last);
+int http_cookie_merge(struct htx *htx, struct http_hdr *list, int first);
+
+struct buffer *http_load_errorfile(const char *file, char **errmsg);
+struct buffer *http_load_errormsg(const char *key, const struct ist msg, char **errmsg);
+struct buffer *http_parse_errorfile(int status, const char *file, char **errmsg);
+struct buffer *http_parse_errorloc(int errloc, int status, const char *url, char **errmsg);
+int proxy_dup_default_conf_errors(struct proxy *curpx, const struct proxy *defpx, char **errmsg);
+void proxy_release_conf_errors(struct proxy *px);
+
+#endif /* _HAPROXY_HTTP_HTX_H */
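The http_hdr_ctx structure from http_htx-t.h cooperates with the http_find_*_header() family above: starting with ctx.blk = NULL makes the search begin at the first block, and each successful call exposes the matched value in ctx.value. A small illustrative loop:

    #include <stdio.h>
    #include <haproxy/http_htx.h>

    /* Walk every value of the "cookie" header in an HTX message. */
    static void dump_cookies(const struct htx *htx)
    {
            struct http_hdr_ctx ctx = { .blk = NULL };

            while (http_find_header(htx, ist("cookie"), &ctx, 0))
                    printf("cookie: %.*s\n", (int)ctx.value.len, ctx.value.ptr);
    }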
diff --git a/include/haproxy/http_rules.h b/include/haproxy/http_rules.h
new file mode 100644
index 0000000..740b546
--- /dev/null
+++ b/include/haproxy/http_rules.h
@@ -0,0 +1,56 @@
+/*
+ * include/haproxy/http_rules.h
+ * This file contains "http" rules definitions
+ *
+ * Copyright (C) 2000-2018 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_HTTP_RULES_H
+#define _HAPROXY_HTTP_RULES_H
+
+#include <haproxy/action-t.h>
+#include <haproxy/api.h>
+#include <haproxy/list.h>
+#include <haproxy/proxy-t.h>
+
+extern struct action_kw_list http_req_keywords;
+extern struct action_kw_list http_res_keywords;
+extern struct action_kw_list http_after_res_keywords;
+
+struct act_rule *parse_http_req_cond(const char **args, const char *file, int linenum, struct proxy *proxy);
+struct act_rule *parse_http_res_cond(const char **args, const char *file, int linenum, struct proxy *proxy);
+struct act_rule *parse_http_after_res_cond(const char **args, const char *file, int linenum, struct proxy *proxy);
+void http_free_redirect_rule(struct redirect_rule *rdr);
+struct redirect_rule *http_parse_redirect_rule(const char *file, int linenum, struct proxy *curproxy,
+                                               const char **args, char **errmsg, int use_fmt, int dir);
+
+void http_req_keywords_register(struct action_kw_list *kw_list);
+void http_res_keywords_register(struct action_kw_list *kw_list);
+void http_after_res_keywords_register(struct action_kw_list *kw_list);
+
+struct action_kw *action_http_req_custom(const char *kw);
+struct action_kw *action_http_res_custom(const char *kw);
+struct action_kw *action_http_after_res_custom(const char *kw);
+
+#endif /* _HAPROXY_HTTP_RULES_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/htx-t.h b/include/haproxy/htx-t.h
new file mode 100644
index 0000000..2ea6bc8
--- /dev/null
+++ b/include/haproxy/htx-t.h
@@ -0,0 +1,277 @@
+/*
+ * include/haproxy/htx-t.h
+ * This file declares the types and constants used by the internal HTTP messages
+ *
+ * Copyright (C) 2018 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_HTX_T_H
+#define _HAPROXY_HTX_T_H
+
+#include <haproxy/api.h>
+#include <haproxy/http-t.h>
+#include <haproxy/show_flags-t.h>
+
+/*
+ * The internal representation of an HTTP message, called HTX, is a structure
+ * with useful information on the message followed by a contiguous array
+ * containing parts of the message, called blocks. A block is composed of
+ * metadata (htx_blk) and the associated payload. Block metadata is stored
+ * starting from the end of the array while payloads are stored at the
+ * beginning. For brevity, the metadata entries themselves are often simply
+ * called blocks.
+ *
+ *  +-----+---------------+------------------------------+--------------+
+ *  | HTX |  PAYLOADS ==> |                              | <== HTX_BLKs |
+ *  +-----+---------------+------------------------------+--------------+
+ *          ^
+ *        blocks[] (the beginning of the blocks array)
+ *
+ * The blocks part remains linear and sorted. It may be thought of as an array
+ * with negative indexes. But, instead of using negative indexes, positive
+ * positions are used to identify a block. A position is then converted to an
+ * address relative to the beginning of the blocks array.
+ *
+ *      .....--+------------------------------+-----+-----+
+ *             |                       ...    | BLK | BLK |
+ *      .....--+------------------------------+-----+-----+
+ *                                            ^     ^
+ *                            Addr of the block     Addr of the block
+ *                            at the position 1     at the position 0
+ *
+ * The payloads part is a raw space that may wrap. A block's payload is never
+ * accessed directly. Instead, the block is used to retrieve the address of
+ * its payload. When no more space is left between the blocks and payloads
+ * parts, the free space at the beginning, if any, is used.
+ *
+ *  +----------- WRAPPING ------------------------+
+ *  |                                             |
+ *  V                                             |
+ *  +-----+-------------+---------------+---------------++--------------+
+ *  | HTX | PAYLOAD ==> |               | PAYLOADS ==X  || X== HTX_BLKs |
+ *  +-----+-------------+---------------+---------------++--------------+
+ *
+ * The blocks part, on its side, never wraps. If there is no space to allocate
+ * a new block and there is a hole at the beginning of the blocks part (so at
+ * the end of the blocks array), all blocks are moved back.
+ *
+ *  ...+--------------+----------+   blocks  ...+----------+--------------+
+ *     | X== HTX_BLKS |          |   defrag     |          | <== HTX_BLKS |
+ *  ...+--------------+----------+   =====>  ...+----------+--------------+
+ *
+ * In the end, if payload wrapping or block defragmentation is not enough,
+ * some free space may be reclaimed through a full defragmentation. Until
+ * then, holes in the middle are not reusable, although they still count in
+ * the available free space. The only way to reuse this lost space is to
+ * fully defragment the HTX message.
+ *
+ *                                   - * -
+ *
+ * An HTX block can be a header as well as a body part or a trailer. For all
+ * these types of block, a payload is attached to the block. A block can also
+ * be a mark, like the end-of-headers or the end-of-trailers. Such blocks have
+ * no payload but still count for one byte, so it is important not to skip
+ * them when data are forwarded. The metadata of an HTX block is composed of
+ * 2 fields :
+ *
+ *   - .info : a 32-bit field containing the block's type on 4 bits, followed
+ *             by the payload length. See below for details.
+ *
+ *   - .addr : the payload's address, if any, relative to the beginning of
+ *             the array used to store the HTX message itself.
+ *
+ * htx_blk.info representation :
+ *
+ *   0b 0000 0000 0000 0000 0000 0000 0000 0000
+ *      ---- ------------------------ ---------
+ *      type     value (1 MB max)     name length (header/trailer)
+ *           ----------------------------------
+ *                data length (256 MB max)
+ *       (body, method, path, version, status, reason)
+ *
+ *   types :
+ *     - 0000 = request  start-line
+ *     - 0001 = response start-line
+ *     - 0010 = header
+ *     - 0011 = pseudo-header or "special" header
+ *     - 0100 = end-of-headers
+ *     - 0101 = data
+ *     - 0110 = trailer
+ *     - 0111 = end-of-trailers
+ *       ...
+ *     - 1111 = unused
+ *
+ */
+
+/* HTX start-line flags.
+ * Please also update the hsl_show_flags() function below in case of changes.
+ */
+#define HTX_SL_F_NONE           0x00000000
+#define HTX_SL_F_IS_RESP        0x00000001 /* It is the response start-line (unset means the request one) */
+#define HTX_SL_F_XFER_LEN       0x00000002 /* The message xfer size can be determined */
+#define HTX_SL_F_XFER_ENC       0x00000004 /* The transfer-encoding header was found in message */
+#define HTX_SL_F_CLEN           0x00000008 /* The content-length header was found in message */
+#define HTX_SL_F_CHNK           0x00000010 /* The message payload is chunked */
+#define HTX_SL_F_VER_11         0x00000020 /* The message indicates version 1.1 or above */
+#define HTX_SL_F_BODYLESS       0x00000040 /* The message has no body (content-length = 0) */
+#define HTX_SL_F_HAS_SCHM       0x00000080 /* The scheme is explicitly specified */
+#define HTX_SL_F_SCHM_HTTP      0x00000100 /* The scheme HTTP should be used */
+#define HTX_SL_F_SCHM_HTTPS    0x00000200 /* The scheme HTTPS should be used */
+#define HTX_SL_F_HAS_AUTHORITY  0x00000400 /* The request authority is explicitly specified */
+#define HTX_SL_F_NORMALIZED_URI 0x00000800 /* The received URI is normalized (an implicit absolute-uri form) */
+#define HTX_SL_F_CONN_UPG       0x00001000 /* The message contains "connection: upgrade" header */
+#define HTX_SL_F_BODYLESS_RESP  0x00002000 /* The response to this message is bodyless (only for requests) */
+
+/* This function is used to report flags in debugging tools. Please reflect
+ * below any single-bit flag addition above in the same order via the
+ * __APPEND_FLAG macro. The new end of the buffer is returned.
+ */
+static forceinline char *hsl_show_flags(char *buf, size_t len, const char *delim, uint flg)
+{
+#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__)
+	/* prologue */
+	_(0);
+	/* flags */
+
+	_(HTX_SL_F_IS_RESP, _(HTX_SL_F_XFER_LEN, _(HTX_SL_F_XFER_ENC,
+	_(HTX_SL_F_CLEN, _(HTX_SL_F_CHNK, _(HTX_SL_F_VER_11,
+	_(HTX_SL_F_BODYLESS, _(HTX_SL_F_HAS_SCHM, _(HTX_SL_F_SCHM_HTTP,
+	_(HTX_SL_F_SCHM_HTTPS, _(HTX_SL_F_HAS_AUTHORITY,
+	_(HTX_SL_F_NORMALIZED_URI, _(HTX_SL_F_CONN_UPG,
+	_(HTX_SL_F_BODYLESS_RESP))))))))))))));
+	/* epilogue */
+	_(~0U);
+	return buf;
+#undef _
+}
+
+/* Overhead induced by HTX on buffers during transfers. In addition to the size
+ * of the HTX structure itself, and the metadata for one block, another block
+ * is accounted for to favor zero-copy transfers.
+ */
+#define HTX_BUF_OVERHEAD (sizeof(struct htx) + 2 * sizeof(struct htx_blk))
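As a cross-check of the encoding described above, the two layouts can be unpacked with plain shifts and masks, exactly as the htx_get_blk_type()/htx_get_blksz() accessors in htx.h do; a standalone sketch:

    #include <stdint.h>
    #include <stdio.h>

    /* Decode an htx_blk.info word according to the documented layout:
     * bits 31..28 hold the type; headers (0x2) and trailers (0x6) split the
     * remaining bits into a 20-bit value length and an 8-bit name length,
     * every other type uses a single 28-bit length.
     */
    static void decode_info(uint32_t info)
    {
            uint32_t type = info >> 28;

            if (type == 0x2 || type == 0x6) /* header or trailer */
                    printf("type=%u name_len=%u value_len=%u\n",
                           type, info & 0xff, (info >> 8) & 0xfffff);
            else
                    printf("type=%u len=%u\n", type, info & 0xfffffff);
    }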
+
+/* HTX flags.
+ * Please also update the htx_show_flags() function below in case of changes.
+ */
+#define HTX_FL_NONE             0x00000000
+#define HTX_FL_PARSING_ERROR    0x00000001 /* Set when a parsing error occurred */
+#define HTX_FL_PROCESSING_ERROR 0x00000002 /* Set when a processing error occurred */
+#define HTX_FL_FRAGMENTED       0x00000004 /* Set when the HTX buffer is fragmented */
+#define HTX_FL_PROXY_RESP       0x00000008 /* Set when the response was generated by HAProxy */
+#define HTX_FL_EOM              0x00000010 /* Set when end-of-message is reached from the HTTP point of view
+                                            * (at worst, only the EOM block is missing)
+                                            */
+
+/* This function is used to report flags in debugging tools. Please reflect
+ * below any single-bit flag addition above in the same order via the
+ * __APPEND_FLAG macro. The new end of the buffer is returned.
+ */
+static forceinline char *htx_show_flags(char *buf, size_t len, const char *delim, uint flg)
+{
+#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__)
+	/* prologue */
+	_(0);
+	/* flags */
+	_(HTX_FL_PARSING_ERROR, _(HTX_FL_PROCESSING_ERROR,
+	_(HTX_FL_FRAGMENTED, _(HTX_FL_PROXY_RESP, _(HTX_FL_EOM)))));
+	/* epilogue */
+	_(~0U);
+	return buf;
+#undef _
+}
+
+
+/* HTX block's type (max 15). */
+enum htx_blk_type {
+	HTX_BLK_REQ_SL =  0, /* Request start-line */
+	HTX_BLK_RES_SL =  1, /* Response start-line */
+	HTX_BLK_HDR    =  2, /* header name/value block */
+	HTX_BLK_EOH    =  3, /* end-of-headers block */
+	HTX_BLK_DATA   =  4, /* data block */
+	HTX_BLK_TLR    =  5, /* trailer name/value block */
+	HTX_BLK_EOT    =  6, /* end-of-trailers block */
+	/* 7 .. 14 unused */
+	HTX_BLK_UNUSED = 15, /* unused/removed block */
+};
+
+/* One HTX block descriptor */
+struct htx_blk {
+	uint32_t addr; /* relative storage address of the block's payload */
+	uint32_t info; /* information about the block (type, length) */
+};
+
+/* Composite return value used by some HTX functions */
+struct htx_ret {
+	int32_t ret;         /* A numerical value */
+	struct htx_blk *blk; /* An HTX block */
+};
+
+/* HTX start-line */
+struct htx_sl {
+	unsigned int flags; /* HTX_SL_F_* */
+	union {
+		struct {
+			enum http_meth_t meth; /* method */
+		} req;
+		struct {
+			uint16_t status; /* status code */
+		} res;
+	} info;
+
+	/* XXX 2 bytes unused */
+
+	unsigned int len[3]; /* length of the different parts of the start-line */
+	char l[VAR_ARRAY];
+};
+
+/* Internal representation of an HTTP message */
+struct htx {
+	uint32_t size; /* the array size, in bytes, used to store the HTTP message itself */
+	uint32_t data; /* the data size, in bytes. To know the total size used by all allocated
+	                * blocks (blocks and their contents), the size used by the block
+	                * metadata, i.e. [ used * sizeof(struct htx_blk) ], must be added */
+
+	int32_t tail;  /* newest inserted block. -1 if the HTX message is empty */
+	int32_t head;  /* oldest inserted block. -1 if the HTX message is empty */
+	int32_t first; /* position of the first block to (re)start the analysis. -1 if unset */
+
+	uint32_t tail_addr; /* start address of the free space in front of the blocks table */
+	uint32_t head_addr; /* start address of the free space at the beginning */
+	uint32_t end_addr;  /* end address of the free space at the beginning */
+
+	uint64_t extra; /* known bytes amount remaining to receive */
+	uint32_t flags; /* HTX_FL_* */
+
+	/* XXX 4 bytes unused */
+
+	/* Blocks representing the HTTP message itself */
+	char blocks[VAR_ARRAY] __attribute__((aligned(8)));
+};
+
+#endif /* _HAPROXY_HTX_T_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/htx.h b/include/haproxy/htx.h
new file mode 100644
index 0000000..c991c81
--- /dev/null
+++ b/include/haproxy/htx.h
@@ -0,0 +1,885 @@
+/*
+ * include/haproxy/htx.h
+ * This file defines everything related to the internal HTTP messages.
+ *
+ * Copyright (C) 2018 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_HTX_H
+#define _HAPROXY_HTX_H
+
+#include <import/ist.h>
+#include <haproxy/api.h>
+#include <haproxy/buf.h>
+#include <haproxy/chunk.h>
+#include <haproxy/http-hdr-t.h>
+#include <haproxy/http-t.h>
+#include <haproxy/htx-t.h>
+
+/* ->extra field value when the payload length is unknown (non-chunked message
+ * with no "Content-length" header)
+ */
+#define HTX_UNKOWN_PAYLOAD_LENGTH ULLONG_MAX
+
+extern struct htx htx_empty;
+
+struct htx_blk *htx_defrag(struct htx *htx, struct htx_blk *blk, uint32_t info);
+struct htx_blk *htx_add_blk(struct htx *htx, enum htx_blk_type type, uint32_t blksz);
+struct htx_blk *htx_remove_blk(struct htx *htx, struct htx_blk *blk);
+struct htx_ret htx_find_offset(struct htx *htx, uint32_t offset);
+void htx_truncate(struct htx *htx, uint32_t offset);
+struct htx_ret htx_drain(struct htx *htx, uint32_t max);
+
+struct htx_blk *htx_replace_blk_value(struct htx *htx, struct htx_blk *blk,
+                                      const struct ist old, const struct ist new);
+struct htx_ret htx_xfer_blks(struct htx *dst, struct htx *src, uint32_t count,
+                             enum htx_blk_type mark);
+
+struct htx_sl *htx_replace_stline(struct htx *htx, struct htx_blk *blk, const struct ist p1,
+                                  const struct ist p2, const struct ist p3);
+
+struct htx_blk *htx_replace_header(struct htx *htx, struct htx_blk *blk,
+                                   const struct ist name, const struct ist value);
+
+struct htx_ret htx_reserve_max_data(struct htx *htx);
+struct htx_blk *htx_add_data_atonce(struct htx *htx, struct ist data);
+size_t htx_add_data(struct htx *htx, const struct ist data);
+struct htx_blk *htx_add_last_data(struct htx *htx, struct ist data);
+void htx_move_blk_before(struct htx *htx, struct htx_blk **blk, struct htx_blk **ref);
+int htx_append_msg(struct htx *dst, const struct htx *src);
+
+/* Functions and macros to get the parts of the start-line or the length of
+ * these parts. Request and response start-lines are both composed of 3 parts.
+ */
+#define HTX_SL_LEN(sl) ((sl)->len[0] + (sl)->len[1] + (sl)->len[2])
+
+#define HTX_SL_P1_LEN(sl) ((sl)->len[0])
+#define HTX_SL_P2_LEN(sl) ((sl)->len[1])
+#define HTX_SL_P3_LEN(sl) ((sl)->len[2])
+#define HTX_SL_P1_PTR(sl) ((sl)->l)
+#define HTX_SL_P2_PTR(sl) (HTX_SL_P1_PTR(sl) + HTX_SL_P1_LEN(sl))
+#define HTX_SL_P3_PTR(sl) (HTX_SL_P2_PTR(sl) + HTX_SL_P2_LEN(sl))
+
+#define HTX_SL_REQ_MLEN(sl) HTX_SL_P1_LEN(sl)
+#define HTX_SL_REQ_ULEN(sl) HTX_SL_P2_LEN(sl)
+#define HTX_SL_REQ_VLEN(sl) HTX_SL_P3_LEN(sl)
+#define HTX_SL_REQ_MPTR(sl) HTX_SL_P1_PTR(sl)
+#define HTX_SL_REQ_UPTR(sl) HTX_SL_P2_PTR(sl)
+#define HTX_SL_REQ_VPTR(sl) HTX_SL_P3_PTR(sl)
+
+#define HTX_SL_RES_VLEN(sl) HTX_SL_P1_LEN(sl)
+#define HTX_SL_RES_CLEN(sl) HTX_SL_P2_LEN(sl)
+#define HTX_SL_RES_RLEN(sl) HTX_SL_P3_LEN(sl)
+#define HTX_SL_RES_VPTR(sl) HTX_SL_P1_PTR(sl)
+#define HTX_SL_RES_CPTR(sl) HTX_SL_P2_PTR(sl)
+#define HTX_SL_RES_RPTR(sl) HTX_SL_P3_PTR(sl)
+
+static inline struct ist htx_sl_p1(const struct htx_sl *sl)
+{
+	return ist2(HTX_SL_P1_PTR(sl), HTX_SL_P1_LEN(sl));
+}
+
+static inline struct ist htx_sl_p2(const struct htx_sl *sl)
+{
+	return ist2(HTX_SL_P2_PTR(sl), HTX_SL_P2_LEN(sl));
+}
+
+static inline struct ist htx_sl_p3(const struct htx_sl *sl)
+{
+	return ist2(HTX_SL_P3_PTR(sl), HTX_SL_P3_LEN(sl));
+}
+
+static inline struct ist htx_sl_req_meth(const struct htx_sl *sl)
+{
+	return htx_sl_p1(sl);
+}
+
+static inline struct ist htx_sl_req_uri(const struct htx_sl *sl)
+{
+	return htx_sl_p2(sl);
+}
+
+static inline struct ist htx_sl_req_vsn(const struct htx_sl *sl)
+{
+	return htx_sl_p3(sl);
+}
+
+
+static inline struct ist htx_sl_res_vsn(const struct htx_sl *sl)
+{
+	return htx_sl_p1(sl);
+}
+
+static inline struct ist htx_sl_res_code(const struct htx_sl *sl)
+{
+	return htx_sl_p2(sl);
+}
+
+static inline struct ist htx_sl_res_reason(const struct htx_sl *sl)
+{
+	return htx_sl_p3(sl);
+}
+
+/* Converts a position to the corresponding relative address */
+static inline uint32_t htx_pos_to_addr(const struct htx *htx, uint32_t pos)
+{
+	return htx->size - (pos + 1) * sizeof(struct htx_blk);
+}
+
+/* Returns the position of the block <blk>. It is the caller's responsibility
+ * to be sure <blk> is part of <htx>. */
+static inline uint32_t htx_get_blk_pos(const struct htx *htx, const struct htx_blk *blk)
+{
+	return ((htx->blocks + htx->size - (char *)blk) / sizeof(struct htx_blk) - 1);
+}
+
+/* Returns the block at the position <pos>. It is the caller's responsibility
+ * to be sure the block at the position <pos> exists. */
+static inline struct htx_blk *htx_get_blk(const struct htx *htx, uint32_t pos)
+{
+	return (struct htx_blk *)(htx->blocks + htx_pos_to_addr(htx, pos));
+}
+
+/* Returns the type of the block <blk> */
+static inline enum htx_blk_type htx_get_blk_type(const struct htx_blk *blk)
+{
+	return (blk->info >> 28);
+}
+
+/* Returns the size of the block <blk>, depending on its type */
+static inline uint32_t htx_get_blksz(const struct htx_blk *blk)
+{
+	enum htx_blk_type type = htx_get_blk_type(blk);
+
+	switch (type) {
+	case HTX_BLK_HDR:
+	case HTX_BLK_TLR:
+		/* name.length + value.length */
+		return ((blk->info & 0xff) + ((blk->info >> 8) & 0xfffff));
+	default:
+		/* value.length */
+		return (blk->info & 0xfffffff);
+	}
+}
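Put together, the part accessors make reading a start-line straightforward; a sketch using http_get_stline() from http_htx.h (the printing is only illustrative):

    #include <stdio.h>
    #include <haproxy/htx.h>
    #include <haproxy/http_htx.h>

    /* Print the request line of an HTX message: method, URI, version. */
    static void print_req_line(const struct htx *htx)
    {
            struct htx_sl *sl = http_get_stline(htx);

            if (!sl || (sl->flags & HTX_SL_F_IS_RESP))
                    return;

            printf("%.*s %.*s %.*s\n",
                   (int)HTX_SL_REQ_MLEN(sl), HTX_SL_REQ_MPTR(sl),
                   (int)HTX_SL_REQ_ULEN(sl), HTX_SL_REQ_UPTR(sl),
                   (int)HTX_SL_REQ_VLEN(sl), HTX_SL_REQ_VPTR(sl));
    }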
+
+/* Returns the position of the oldest entry (head). It returns a signed 32-bit
+ * integer; -1 means the HTX message is empty.
+ */
+static inline int32_t htx_get_head(const struct htx *htx)
+{
+	return htx->head;
+}
+
+/* Returns the oldest HTX block (head) if the HTX message is not
+ * empty. Otherwise it returns NULL.
+ */
+static inline struct htx_blk *htx_get_head_blk(const struct htx *htx)
+{
+	int32_t head = htx_get_head(htx);
+
+	return ((head == -1) ? NULL : htx_get_blk(htx, head));
+}
+
+/* Same as above but unchecked; may only be used when it is certain that a
+ * block exists.
+ */
+static inline struct htx_blk *__htx_get_head_blk(const struct htx *htx)
+{
+	int32_t head = htx_get_head(htx);
+
+	return htx_get_blk(htx, head);
+}
+
+/* Returns the type of the oldest HTX block (head) if the HTX message is not
+ * empty. Otherwise it returns HTX_BLK_UNUSED.
+ */
+static inline enum htx_blk_type htx_get_head_type(const struct htx *htx)
+{
+	struct htx_blk *blk = htx_get_head_blk(htx);
+
+	return (blk ? htx_get_blk_type(blk) : HTX_BLK_UNUSED);
+}
+
+/* Returns the position of the newest entry (tail). It returns a signed 32-bit
+ * integer; -1 means the HTX message is empty.
+ */
+static inline int32_t htx_get_tail(const struct htx *htx)
+{
+	return htx->tail;
+}
+
+/* Returns the newest HTX block (tail) if the HTX message is not
+ * empty. Otherwise it returns NULL.
+ */
+static inline struct htx_blk *htx_get_tail_blk(const struct htx *htx)
+{
+	int32_t tail = htx_get_tail(htx);
+
+	return ((tail == -1) ? NULL : htx_get_blk(htx, tail));
+}
+
+/* Returns the type of the newest HTX block (tail) if the HTX message is not
+ * empty. Otherwise it returns HTX_BLK_UNUSED.
+ */
+static inline enum htx_blk_type htx_get_tail_type(const struct htx *htx)
+{
+	struct htx_blk *blk = htx_get_tail_blk(htx);
+
+	return (blk ? htx_get_blk_type(blk) : HTX_BLK_UNUSED);
+}
+
+/* Returns the position of the first block in the HTX message <htx>. -1 means
+ * the first block is unset or the HTX message is empty.
+ */
+static inline int32_t htx_get_first(const struct htx *htx)
+{
+	return htx->first;
+}
+
+/* Returns the first HTX block in the HTX message <htx>. If unset or if <htx>
+ * is empty, NULL is returned.
+ */
+static inline struct htx_blk *htx_get_first_blk(const struct htx *htx)
+{
+	int32_t pos;
+
+	pos = htx_get_first(htx);
+	return ((pos == -1) ? NULL : htx_get_blk(htx, pos));
+}
+
+/* Returns the type of the first block in the HTX message <htx>. If unset or if
+ * <htx> is empty, HTX_BLK_UNUSED is returned.
+ */
+static inline enum htx_blk_type htx_get_first_type(const struct htx *htx)
+{
+	struct htx_blk *blk = htx_get_first_blk(htx);
+
+	return (blk ? htx_get_blk_type(blk) : HTX_BLK_UNUSED);
+}
+
+/* Returns the position of the block immediately before the one pointed to by
+ * <pos>. If the message is empty or if <pos> is the position of the head, -1
+ * is returned.
+ */
+static inline int32_t htx_get_prev(const struct htx *htx, uint32_t pos)
+{
+	if (htx->head == -1 || pos == htx->head)
+		return -1;
+	return (pos - 1);
+}
+
+/* Returns the HTX block before <blk> in the HTX message <htx>. If <blk> is the
+ * head, NULL is returned.
+ */
+static inline struct htx_blk *htx_get_prev_blk(const struct htx *htx,
+                                               const struct htx_blk *blk)
+{
+	int32_t pos;
+
+	pos = htx_get_prev(htx, htx_get_blk_pos(htx, blk));
+	return ((pos == -1) ? NULL : htx_get_blk(htx, pos));
+}
+
+/* Returns the position of the block immediately after the one pointed to by
+ * <pos>. If the message is empty or if <pos> is the position of the tail, -1
+ * is returned.
+ */
+static inline int32_t htx_get_next(const struct htx *htx, uint32_t pos)
+{
+	if (htx->tail == -1 || pos == htx->tail)
+		return -1;
+	return (pos + 1);
+}
+
+/* Returns the HTX block after <blk> in the HTX message <htx>. If <blk> is the
+ * tail, NULL is returned.
+ */
+static inline struct htx_blk *htx_get_next_blk(const struct htx *htx,
+                                               const struct htx_blk *blk)
+{
+	int32_t pos;
+
+	pos = htx_get_next(htx, htx_get_blk_pos(htx, blk));
+	return ((pos == -1) ? NULL : htx_get_blk(htx, pos));
+}
+
+/* Returns 1 if <blk> is the only block inside the HTX message <htx>, excluding
+ * all unused blocks. Otherwise, it returns 0. If 1 is returned, this means
+ * that only <blk> and possibly some unused blocks are present in <htx>.
+ */
+static inline int htx_is_unique_blk(const struct htx *htx,
+                                    const struct htx_blk *blk)
+{
+	return (htx_get_blksz(blk) == htx->data);
+}
+
+/* Changes the size of the value. It is the caller's responsibility to change
+ * the value itself and to make sure there is enough space for the updated
+ * value. This function updates the HTX message accordingly.
+ */
+static inline void htx_change_blk_value_len(struct htx *htx, struct htx_blk *blk, uint32_t newlen)
+{
+	enum htx_blk_type type = htx_get_blk_type(blk);
+	uint32_t oldlen, sz;
+	int32_t delta;
+
+	sz = htx_get_blksz(blk);
+	switch (type) {
+	case HTX_BLK_HDR:
+	case HTX_BLK_TLR:
+		oldlen = (blk->info >> 8) & 0xfffff;
+		blk->info = (type << 28) + (newlen << 8) + (blk->info & 0xff);
+		break;
+	default:
+		oldlen = blk->info & 0xfffffff;
+		blk->info = (type << 28) + newlen;
+		break;
+	}
+
+	/* Update HTTP message */
+	delta = (newlen - oldlen);
+	htx->data += delta;
+	if (blk->addr+sz == htx->tail_addr)
+		htx->tail_addr += delta;
+	else if (blk->addr+sz == htx->head_addr)
+		htx->head_addr += delta;
+}
+
+/* Changes the size of the value. It is the caller's responsibility to change
+ * the value itself and to make sure there is enough space for the updated
+ * value. Unlike the function htx_change_blk_value_len(), this one does not
+ * update the HTX message. So it should be used with caution.
+ */
+static inline void htx_set_blk_value_len(struct htx_blk *blk, uint32_t vlen)
+{
+	enum htx_blk_type type = htx_get_blk_type(blk);
+
+	switch (type) {
+	case HTX_BLK_HDR:
+	case HTX_BLK_TLR:
+		blk->info = (type << 28) + (vlen << 8) + (blk->info & 0xff);
+		break;
+	case HTX_BLK_REQ_SL:
+	case HTX_BLK_RES_SL:
+	case HTX_BLK_DATA:
+		blk->info = (type << 28) + vlen;
+		break;
+	default:
+		/* Unexpected case */
+		break;
+	}
+}
+
+/* Returns the data pointer of the block <blk> */
+static inline void *htx_get_blk_ptr(const struct htx *htx, const struct htx_blk *blk)
+{
+	return ((void *)htx->blocks + blk->addr);
+}
+
+/* Returns the name of the block <blk>, only if it is a header or a
+ * trailer. Otherwise it returns an empty string.
+ */
+static inline struct ist htx_get_blk_name(const struct htx *htx, const struct htx_blk *blk)
+{
+	enum htx_blk_type type = htx_get_blk_type(blk);
+	struct ist ret;
+
+	switch (type) {
+	case HTX_BLK_HDR:
+	case HTX_BLK_TLR:
+		ret = ist2(htx_get_blk_ptr(htx, blk),
+		           blk->info & 0xff);
+		break;
+
+	default:
+		return ist("");
+	}
+	return ret;
+}
+
+
+/* Returns the value of the block <blk>, depending on its type. If there is no
+ * value (for end-of-* blocks), an empty one is returned.
+ */
+static inline struct ist htx_get_blk_value(const struct htx *htx, const struct htx_blk *blk)
+{
+	enum htx_blk_type type = htx_get_blk_type(blk);
+	struct ist ret;
+
+	switch (type) {
+	case HTX_BLK_HDR:
+	case HTX_BLK_TLR:
+		ret = ist2(htx_get_blk_ptr(htx, blk) + (blk->info & 0xff),
+		           (blk->info >> 8) & 0xfffff);
+		break;
+
+	case HTX_BLK_REQ_SL:
+	case HTX_BLK_RES_SL:
+	case HTX_BLK_DATA:
+		ret = ist2(htx_get_blk_ptr(htx, blk),
+		           blk->info & 0xfffffff);
+		break;
+
+	default:
+		return ist("");
+	}
+	return ret;
+}
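These accessors combine into the canonical HTX traversal loop, the same shape used by htx_dump() at the end of this file; an illustrative header dump:

    #include <stdio.h>
    #include <haproxy/htx.h>

    /* Print all header blocks of an HTX message, stopping at end-of-headers. */
    static void dump_headers(const struct htx *htx)
    {
            int32_t pos;

            for (pos = htx_get_head(htx); pos != -1; pos = htx_get_next(htx, pos)) {
                    struct htx_blk *blk = htx_get_blk(htx, pos);
                    enum htx_blk_type type = htx_get_blk_type(blk);
                    struct ist n, v;

                    if (type == HTX_BLK_EOH)
                            break;
                    if (type != HTX_BLK_HDR)
                            continue;

                    n = htx_get_blk_name(htx, blk);
                    v = htx_get_blk_value(htx, blk);
                    printf("%.*s: %.*s\n", (int)n.len, n.ptr, (int)v.len, v.ptr);
            }
    }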
+
+/* Adds a new start-line. It returns it on success, otherwise it returns NULL.
+ * It is the caller's responsibility to set sl->info, if necessary.
+ */
+static inline struct htx_sl *htx_add_stline(struct htx *htx, enum htx_blk_type type, unsigned int flags,
+                                            const struct ist p1, const struct ist p2, const struct ist p3)
+{
+	struct htx_blk *blk;
+	struct htx_sl *sl;
+	uint32_t size;
+
+	if (type != HTX_BLK_REQ_SL && type != HTX_BLK_RES_SL)
+		return NULL;
+
+	size = sizeof(*sl) + p1.len + p2.len + p3.len;
+
+	blk = htx_add_blk(htx, type, size);
+	if (!blk)
+		return NULL;
+	blk->info += size;
+
+	sl = htx_get_blk_ptr(htx, blk);
+	sl->flags = flags;
+
+	HTX_SL_P1_LEN(sl) = p1.len;
+	HTX_SL_P2_LEN(sl) = p2.len;
+	HTX_SL_P3_LEN(sl) = p3.len;
+
+	memcpy(HTX_SL_P1_PTR(sl), p1.ptr, p1.len);
+	memcpy(HTX_SL_P2_PTR(sl), p2.ptr, p2.len);
+	memcpy(HTX_SL_P3_PTR(sl), p3.ptr, p3.len);
+
+	return sl;
+}
+
+/* Adds an HTX block of type HDR in <htx>. It returns the new block on
+ * success. Otherwise, it returns NULL. The header name is always lower cased.
+ */
+static inline struct htx_blk *htx_add_header(struct htx *htx, const struct ist name,
+                                             const struct ist value)
+{
+	struct htx_blk *blk;
+
+	if (name.len > 255 || value.len > 1048575)
+		return NULL;
+
+	blk = htx_add_blk(htx, HTX_BLK_HDR, name.len + value.len);
+	if (!blk)
+		return NULL;
+
+	blk->info += (value.len << 8) + name.len;
+	ist2bin_lc(htx_get_blk_ptr(htx, blk), name);
+	memcpy(htx_get_blk_ptr(htx, blk) + name.len, value.ptr, value.len);
+	return blk;
+}
+
+/* Adds an HTX block of type TLR in <htx>. It returns the new block on
+ * success. Otherwise, it returns NULL. The trailer name is always lower cased.
+ */
+static inline struct htx_blk *htx_add_trailer(struct htx *htx, const struct ist name,
+                                              const struct ist value)
+{
+	struct htx_blk *blk;
+
+	if (name.len > 255 || value.len > 1048575)
+		return NULL;
+
+	blk = htx_add_blk(htx, HTX_BLK_TLR, name.len + value.len);
+	if (!blk)
+		return NULL;
+
+	blk->info += (value.len << 8) + name.len;
+	ist2bin_lc(htx_get_blk_ptr(htx, blk), name);
+	memcpy(htx_get_blk_ptr(htx, blk) + name.len, value.ptr, value.len);
+	return blk;
+}
+
+/* Adds an HTX block of type EOH or EOT in <htx>. It returns the new block on
+ * success. Otherwise, it returns NULL.
+ */
+static inline struct htx_blk *htx_add_endof(struct htx *htx, enum htx_blk_type type)
+{
+	struct htx_blk *blk;
+
+	blk = htx_add_blk(htx, type, 1);
+	if (!blk)
+		return NULL;
+
+	blk->info += 1;
+	return blk;
+}
+
+/* Adds all headers from the list <hdrs> into the HTX message <htx>, followed
+ * by the EOH. On success, it returns the last block inserted (the EOH),
+ * otherwise NULL is returned.
+ *
+ * Headers with a NULL value (.ptr == NULL) are ignored, but not those with an
+ * empty value (.len == 0 but .ptr != NULL).
+ */
+static inline struct htx_blk *htx_add_all_headers(struct htx *htx, const struct http_hdr *hdrs)
+{
+	int i;
+
+	for (i = 0; hdrs[i].n.len; i++) {
+		/* Don't check the value length because a header value may be empty */
+		if (isttest(hdrs[i].v) == 0)
+			continue;
+		if (!htx_add_header(htx, hdrs[i].n, hdrs[i].v))
+			return NULL;
+	}
+	return htx_add_endof(htx, HTX_BLK_EOH);
+}
+
+/* Adds all trailers from the list <hdrs> into the HTX message <htx>, followed
+ * by the EOT. On success, it returns the last block inserted (the EOT),
+ * otherwise NULL is returned.
+ *
+ * Trailers with a NULL value (.ptr == NULL) are ignored, but not those with an
+ * empty value (.len == 0 but .ptr != NULL).
+ */
+static inline struct htx_blk *htx_add_all_trailers(struct htx *htx, const struct http_hdr *hdrs)
+{
+	int i;
+
+	for (i = 0; hdrs[i].n.len; i++) {
+		/* Don't check the value length because a trailer value may be empty */
+		if (isttest(hdrs[i].v) == 0)
+			continue;
+		if (!htx_add_trailer(htx, hdrs[i].n, hdrs[i].v))
+			return NULL;
+	}
+	return htx_add_endof(htx, HTX_BLK_EOT);
+}
+
+/* Removes <n> bytes from the beginning of DATA block <blk>. The block's start
+ * address and its length are adjusted, and the htx's total data count is
+ * updated. This is used to mark that part of some data was transferred from a
+ * DATA block without removing this DATA block. No sanity check is performed;
+ * the caller is responsible for doing this exclusively on DATA blocks, and
+ * never removing more than the block's size.
+ */
+static inline void htx_cut_data_blk(struct htx *htx, struct htx_blk *blk, uint32_t n)
+{
+	if (blk->addr == htx->end_addr)
+		htx->end_addr += n;
+	blk->addr += n;
+	blk->info -= n;
+	htx->data -= n;
+}
+
+/* Returns the space used by metadata in <htx>. */
+static inline uint32_t htx_meta_space(const struct htx *htx)
+{
+	if (htx->tail == -1)
+		return 0;
+
+	return ((htx->tail + 1 - htx->head) * sizeof(struct htx_blk));
+}
+
+/* Returns the space used (payload + metadata) in <htx> */
+static inline uint32_t htx_used_space(const struct htx *htx)
+{
+	return (htx->data + htx_meta_space(htx));
+}
+
+/* Returns the free space in <htx> */
+static inline uint32_t htx_free_space(const struct htx *htx)
+{
+	return (htx->size - htx_used_space(htx));
+}
+
+/* Returns the maximum size available to store some data in <htx> if a new
+ * block is reserved.
+ */
+static inline uint32_t htx_free_data_space(const struct htx *htx)
+{
+	uint32_t free = htx_free_space(htx);
+
+	if (free < sizeof(struct htx_blk))
+		return 0;
+	return (free - sizeof(struct htx_blk));
+}
+
+/* Returns non-zero only if the HTX message free space wraps */
+static inline int htx_space_wraps(const struct htx *htx)
+{
+	uint32_t headroom, tailroom;
+
+	headroom = (htx->end_addr - htx->head_addr);
+	tailroom = (htx_pos_to_addr(htx, htx->tail) - htx->tail_addr);
+
+	return (headroom && tailroom);
+}
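The htx_add_*() helpers above compose in document order: start-line, headers, end-of-headers, then data. A hedged sketch building a tiny response; the flag choice and status handling are illustrative, not the one true recipe:

    #include <haproxy/htx.h>

    /* Build a minimal HTTP/1.1 response in <htx>: start-line, one header,
     * end-of-headers, a 2-byte body, then mark the end of the message.
     * Returns 1 on success, 0 when the message does not fit.
     */
    static int build_response(struct htx *htx)
    {
            struct htx_sl *sl;

            sl = htx_add_stline(htx, HTX_BLK_RES_SL, HTX_SL_F_IS_RESP | HTX_SL_F_VER_11,
                                ist("HTTP/1.1"), ist("200"), ist("OK"));
            if (!sl)
                    return 0;
            sl->info.res.status = 200; /* caller's responsibility, per htx_add_stline() */

            if (!htx_add_header(htx, ist("content-length"), ist("2")) ||
                !htx_add_endof(htx, HTX_BLK_EOH) ||
                htx_add_data(htx, ist("OK")) != 2)
                    return 0;

            return htx_set_eom(htx);
    }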
+
+/* Returns the maximum size for a block, not exceeding <max> bytes. <max> may
+ * be set to -1 to have no limit.
+ */
+static inline uint32_t htx_get_max_blksz(const struct htx *htx, int32_t max)
+{
+	uint32_t free = htx_free_space(htx);
+
+	if (max != -1 && free > max)
+		free = max;
+	if (free < sizeof(struct htx_blk))
+		return 0;
+	return (free - sizeof(struct htx_blk));
+}
+
+/* Returns 1 if the message has less than 1/4 of its capacity free, otherwise 0 */
+static inline int htx_almost_full(const struct htx *htx)
+{
+	if (!htx->size || htx_free_space(htx) < htx->size / 4)
+		return 1;
+	return 0;
+}
+
+/* Resets an HTX message */
+static inline void htx_reset(struct htx *htx)
+{
+	htx->tail = htx->head = htx->first = -1;
+	htx->data = 0;
+	htx->tail_addr = htx->head_addr = htx->end_addr = 0;
+	htx->extra = 0;
+	htx->flags = HTX_FL_NONE;
+}
+
+/* Returns the available room for raw data in buffer <buf> once the HTX
+ * overhead is taken into account (one HTX header and two blocks). The purpose
+ * is to figure the optimal fill length to avoid copies.
+ */
+static inline size_t buf_room_for_htx_data(const struct buffer *buf)
+{
+	size_t room;
+
+	room = b_room(buf);
+	if (room <= HTX_BUF_OVERHEAD)
+		room = 0;
+	else
+		room -= HTX_BUF_OVERHEAD;
+
+	return room;
+}
+
+
+/* Returns an HTX message using the buffer <buf>. Unlike htx_from_buf(), this
+ * function does not update the buffer. So if the HTX message is updated, the
+ * caller must call htx_to_buf() to be sure to also update the underlying
+ * buffer accordingly. Note that it always returns a valid pointer, either to
+ * an initialized buffer or to the empty buffer. This function must always be
+ * called with a buffer containing an HTX message (or an empty buffer).
+ */
+static inline struct htx *htxbuf(const struct buffer *buf)
+{
+	struct htx *htx;
+
+	if (b_is_null(buf))
+		return &htx_empty;
+	htx = ((struct htx *)(buf->area));
+	if (!b_data(buf)) {
+		htx->size = buf->size - sizeof(*htx);
+		htx_reset(htx);
+	}
+	return htx;
+}
+
+/* Returns an HTX message using the buffer <buf>. <buf> is updated to appear as
+ * full. It should be used when you want to add something into the HTX message,
+ * so the call to htx_to_buf() may be skipped. But it is the caller's
+ * responsibility to call htx_to_buf() to reset <buf> if it is relevant. The
+ * returned pointer is always valid. This function must always be called with a
+ * buffer containing an HTX message (or an empty buffer).
+ *
+ * The caller can call the htxbuf() function to avoid any update of the buffer.
+ */
+static inline struct htx *htx_from_buf(struct buffer *buf)
+{
+	struct htx *htx = htxbuf(buf);
+
+	b_set_data(buf, b_size(buf));
+	return htx;
+}
+
+/* Updates <buf> according to the HTX message <htx> */
+static inline void htx_to_buf(struct htx *htx, struct buffer *buf)
+{
+	if ((htx->head == -1) &&
+	    !(htx->flags & (HTX_FL_PARSING_ERROR|HTX_FL_PROCESSING_ERROR))) {
+		htx_reset(htx);
+		b_set_data(buf, 0);
+	}
+	else
+		b_set_data(buf, b_size(buf));
+}
+
+/* Returns 1 if the message is empty, otherwise it returns 0. Note that it is
+ * illegal to call this with htx == NULL.
+ */
+static inline int htx_is_empty(const struct htx *htx)
+{
+	return (htx->head == -1);
+}
+
+/* Returns 1 if the message is not empty, otherwise it returns 0. Note that it
+ * is illegal to call this with htx == NULL.
+ */
+static inline int htx_is_not_empty(const struct htx *htx)
+{
+	return (htx->head != -1);
+}
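htxbuf(), htx_from_buf() and htx_to_buf() define the usual edit cycle for an HTX message living in a struct buffer; a minimal sketch (the header added here is purely illustrative):

    #include <haproxy/htx.h>

    /* Typical pattern when modifying an HTX message stored in a buffer:
     * map the buffer to an HTX view, mutate it, then resynchronize.
     */
    static void tag_response(struct buffer *buf)
    {
            struct htx *htx = htx_from_buf(buf); /* buffer now appears full */

            htx_add_header(htx, ist("x-tagged"), ist("1"));

            htx_to_buf(htx, buf); /* resync <buf> with the HTX state */
    }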
+ * Since only that flag is checked, tunneled data are not considered here.
+ */
+static inline int htx_expect_more(const struct htx *htx)
+{
+	return !(htx->flags & HTX_FL_EOM);
+}
+
+/* Set EOM flag in <htx>. This function is useful if the HTX message is empty.
+ * In this case, an EOT block is appended first to ensure the EOM will be
+ * forwarded as expected. This is a workaround, as it is currently not possible
+ * to push an empty HTX DATA block.
+ *
+ * Returns 1 on success else 0.
+ */
+static inline int htx_set_eom(struct htx *htx)
+{
+	if (htx_is_empty(htx)) {
+		if (!htx_add_endof(htx, HTX_BLK_EOT))
+			return 0;
+	}
+
+	htx->flags |= HTX_FL_EOM;
+	return 1;
+}
+
+/* Copy an HTX message stored in the buffer <msg> to <htx>. We take care not to
+ * overwrite existing data. Either the whole message is copied, or nothing at
+ * all. It returns 1 on success and 0 on error.
+ */
+static inline int htx_copy_msg(struct htx *htx, const struct buffer *msg)
+{
+	/* The destination HTX message is allocated and empty, we can do a raw copy */
+	if (htx_is_empty(htx) && htx_free_space(htx)) {
+		memcpy(htx, msg->area, msg->size);
+		return 1;
+	}
+
+	/* Otherwise, we need to append the HTX message */
+	return htx_append_msg(htx, htxbuf(msg));
+}
+
+/* Remove all blocks except headers. Trailers are removed too. */
+static inline void htx_skip_msg_payload(struct htx *htx)
+{
+	struct htx_blk *blk = htx_get_first_blk(htx);
+
+	while (blk) {
+		enum htx_blk_type type = htx_get_blk_type(blk);
+
+		blk = ((type > HTX_BLK_EOH)
+		       ? htx_remove_blk(htx, blk)
+		       : htx_get_next_blk(htx, blk));
+	}
+}
+
+/* Returns the number of used blocks in the HTX message <htx>. Note that it is
+ * illegal to call this function with htx == NULL. Note also that blocks of
+ * type HTX_BLK_UNUSED count as used blocks.
+ */
+static inline int htx_nbblks(const struct htx *htx)
+{
+	return ((htx->head != -1) ? (htx->tail + 1 - htx->head) : 0);
+}
+
+/* For debugging purposes */
+static inline const char *htx_blk_type_str(enum htx_blk_type type)
+{
+	switch (type) {
+		case HTX_BLK_REQ_SL: return "HTX_BLK_REQ_SL";
+		case HTX_BLK_RES_SL: return "HTX_BLK_RES_SL";
+		case HTX_BLK_HDR:    return "HTX_BLK_HDR";
+		case HTX_BLK_EOH:    return "HTX_BLK_EOH";
+		case HTX_BLK_DATA:   return "HTX_BLK_DATA";
+		case HTX_BLK_TLR:    return "HTX_BLK_TLR";
+		case HTX_BLK_EOT:    return "HTX_BLK_EOT";
+		case HTX_BLK_UNUSED: return "HTX_BLK_UNUSED";
+		default:             return "HTX_BLK_???";
+	};
+}
+
+/* For debugging purposes */
+static inline void htx_dump(struct buffer *chunk, const struct htx *htx, int full)
+{
+	int32_t pos;
+
+	chunk_appendf(chunk, " htx=%p(size=%u,data=%u,used=%u,wrap=%s,flags=0x%08x,extra=%llu,"
+		      "first=%d,head=%d,tail=%d,tail_addr=%d,head_addr=%d,end_addr=%d)",
+		      htx, htx->size, htx->data, htx_nbblks(htx), (!htx->head_addr) ?
"NO" : "YES", + htx->flags, (unsigned long long)htx->extra, htx->first, htx->head, htx->tail, + htx->tail_addr, htx->head_addr, htx->end_addr); + + if (!full || !htx_nbblks(htx)) + return; + chunk_memcat(chunk, "\n", 1); + + for (pos = htx_get_head(htx); pos != -1; pos = htx_get_next(htx, pos)) { + struct htx_sl *sl; + struct htx_blk *blk = htx_get_blk(htx, pos); + enum htx_blk_type type = htx_get_blk_type(blk); + uint32_t sz = htx_get_blksz(blk); + struct ist n, v; + + n = htx_get_blk_name(htx, blk); + v = htx_get_blk_value(htx, blk); + + if (type == HTX_BLK_REQ_SL || type == HTX_BLK_RES_SL) { + sl = htx_get_blk_ptr(htx, blk); + chunk_appendf(chunk, "\t\t[%u] type=%-17s - size=%-6u - addr=%-6u\t%.*s %.*s %.*s\n", + pos, htx_blk_type_str(type), sz, blk->addr, + HTX_SL_P1_LEN(sl), HTX_SL_P1_PTR(sl), + HTX_SL_P2_LEN(sl), HTX_SL_P2_PTR(sl), + HTX_SL_P3_LEN(sl), HTX_SL_P3_PTR(sl)); + } + else if (type == HTX_BLK_HDR || type == HTX_BLK_TLR) + chunk_appendf(chunk, "\t\t[%u] type=%-17s - size=%-6u - addr=%-6u\t%.*s: %.*s\n", + pos, htx_blk_type_str(type), sz, blk->addr, + (int)MIN(n.len, 32), n.ptr, + (int)MIN(v.len, 64), v.ptr); + else + chunk_appendf(chunk, "\t\t[%u] type=%-17s - size=%-6u - addr=%-6u%s\n", + pos, htx_blk_type_str(type), sz, blk->addr, + (!v.len ? "\t<empty>" : "")); + } +} + +#endif /* _HAPROXY_HTX_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/init-t.h b/include/haproxy/init-t.h new file mode 100644 index 0000000..110171b --- /dev/null +++ b/include/haproxy/init-t.h @@ -0,0 +1,64 @@ +#ifndef _HAPROXY_INIT_T_H +#define _HAPROXY_INIT_T_H + +#include <haproxy/list-t.h> + +struct proxy; +struct server; + +struct pre_check_fct { + struct list list; + int (*fct)(); +}; + +struct post_check_fct { + struct list list; + int (*fct)(); +}; + +struct post_proxy_check_fct { + struct list list; + int (*fct)(struct proxy *); +}; + +struct post_server_check_fct { + struct list list; + int (*fct)(struct server *); +}; + +struct per_thread_alloc_fct { + struct list list; + int (*fct)(); +}; + +struct per_thread_init_fct { + struct list list; + int (*fct)(); +}; + +struct post_deinit_fct { + struct list list; + void (*fct)(); +}; + +struct proxy_deinit_fct { + struct list list; + void (*fct)(struct proxy *); +}; + +struct server_deinit_fct { + struct list list; + void (*fct)(struct server *); +}; + +struct per_thread_free_fct { + struct list list; + void (*fct)(); +}; + +struct per_thread_deinit_fct { + struct list list; + void (*fct)(); +}; + +#endif /* _HAPROXY_INIT_T_H */ diff --git a/include/haproxy/init.h b/include/haproxy/init.h new file mode 100644 index 0000000..6e30475 --- /dev/null +++ b/include/haproxy/init.h @@ -0,0 +1,79 @@ +#ifndef _HAPROXY_INIT_H +#define _HAPROXY_INIT_H + +#include <haproxy/init-t.h> +#include <haproxy/initcall.h> + +struct proxy; +struct server; + +extern struct list pre_check_list; +extern struct list post_check_list; +extern struct list post_proxy_check_list; +extern struct list post_server_check_list; +extern struct list per_thread_alloc_list; +extern struct list per_thread_init_list; +extern struct list post_deinit_list; +extern struct list proxy_deinit_list; +extern struct list server_deinit_list; +extern struct list per_thread_free_list; +extern struct list per_thread_deinit_list; + +void hap_register_pre_check(int (*fct)()); +void hap_register_post_check(int (*fct)()); +void hap_register_post_proxy_check(int (*fct)(struct proxy *)); +void hap_register_post_server_check(int 
(*fct)(struct server *));
+void hap_register_post_deinit(void (*fct)());
+void hap_register_proxy_deinit(void (*fct)(struct proxy *));
+void hap_register_server_deinit(void (*fct)(struct server *));
+
+void hap_register_per_thread_alloc(int (*fct)());
+void hap_register_per_thread_init(int (*fct)());
+void hap_register_per_thread_deinit(void (*fct)());
+void hap_register_per_thread_free(void (*fct)());
+
+/* simplified way to declare a pre-check callback in a file */
+#define REGISTER_PRE_CHECK(fct) \
+	INITCALL1(STG_REGISTER, hap_register_pre_check, (fct))
+
+/* simplified way to declare a post-check callback in a file */
+#define REGISTER_POST_CHECK(fct) \
+	INITCALL1(STG_REGISTER, hap_register_post_check, (fct))
+
+/* simplified way to declare a post-proxy-check callback in a file */
+#define REGISTER_POST_PROXY_CHECK(fct) \
+	INITCALL1(STG_REGISTER, hap_register_post_proxy_check, (fct))
+
+/* simplified way to declare a post-server-check callback in a file */
+#define REGISTER_POST_SERVER_CHECK(fct) \
+	INITCALL1(STG_REGISTER, hap_register_post_server_check, (fct))
+
+/* simplified way to declare a post-deinit callback in a file */
+#define REGISTER_POST_DEINIT(fct) \
+	INITCALL1(STG_REGISTER, hap_register_post_deinit, (fct))
+
+/* simplified way to declare a proxy-deinit callback in a file */
+#define REGISTER_PROXY_DEINIT(fct) \
+	INITCALL1(STG_REGISTER, hap_register_proxy_deinit, (fct))
+
+/* simplified way to declare a server-deinit callback in a file */
+#define REGISTER_SERVER_DEINIT(fct) \
+	INITCALL1(STG_REGISTER, hap_register_server_deinit, (fct))
+
+/* simplified way to declare a per-thread allocation callback in a file */
+#define REGISTER_PER_THREAD_ALLOC(fct) \
+	INITCALL1(STG_REGISTER, hap_register_per_thread_alloc, (fct))
+
+/* simplified way to declare a per-thread init callback in a file */
+#define REGISTER_PER_THREAD_INIT(fct) \
+	INITCALL1(STG_REGISTER, hap_register_per_thread_init, (fct))
+
+/* simplified way to declare a per-thread deinit callback in a file */
+#define REGISTER_PER_THREAD_DEINIT(fct) \
+	INITCALL1(STG_REGISTER, hap_register_per_thread_deinit, (fct))
+
+/* simplified way to declare a per-thread free callback in a file */
+#define REGISTER_PER_THREAD_FREE(fct) \
+	INITCALL1(STG_REGISTER, hap_register_per_thread_free, (fct))
+
+#endif /* _HAPROXY_INIT_H */
diff --git a/include/haproxy/initcall.h b/include/haproxy/initcall.h
new file mode 100644
index 0000000..dffec04
--- /dev/null
+++ b/include/haproxy/initcall.h
@@ -0,0 +1,257 @@
+/*
+ * include/haproxy/initcall.h
+ *
+ * Initcall management.
+ *
+ * Copyright (C) 2018-2020 Willy Tarreau - w@1wt.eu
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _HAPROXY_INITCALL_H
+#define _HAPROXY_INITCALL_H
+
+#include <haproxy/compiler.h>
+
+/* List of known init stages. If others are added, please declare their
+ * section at the end of the file below.
+ */
+
+/* The principle of the initcalls is to create optional sections in the target
+ * program which are made of arrays of structures containing a function pointer
+ * and 3 argument pointers. Then at boot time, these sections are scanned in a
+ * well defined order to call in turn each of these functions with their
+ * arguments. This makes it possible to register callbacks from C files without
+ * having to export lots of things nor to cross-reference functions. There are
+ * several initialization stages defined so that certain guarantees are offered
+ * (for example list heads might or might not be initialized, pools might or
+ * might not have been created yet).
+ *
+ * On some very old platforms there is no convenient way to retrieve the start
+ * or stop pointer for these sections so there is no reliable way to enumerate
+ * the callbacks. When this is the case, as detected when USE_OBSOLETE_LINKER
+ * is set, instead of using sections we exclusively use constructors whose name
+ * is based on the current line number in the file to guarantee uniqueness.
+ * When called, these constructors then add their callback to their respective
+ * list. It works as well but slightly inflates the executable's size since
+ * code has to be emitted just to register each of these callbacks.
+ */
+
+/*
+ * Please keep those names short enough, they are used to generate section
+ * names, Mac OS X accepts section names up to 16 characters, and we prefix
+ * them with i_, so stage name can't be more than 14 characters.
+ */
+enum init_stage {
+	STG_PREPARE = 0,   // preset variables, tables, list heads
+	STG_LOCK,          // pre-initialize locks
+	STG_REGISTER,      // register static lists (keywords etc)
+	STG_ALLOC,         // allocate required structures
+	STG_POOL,          // create pools
+	STG_INIT,          // subsystems normal initialization
+	STG_SIZE           // size of the stages array, must be last
+};
+
+/* This is the descriptor for an initcall */
+struct initcall {
+	void (*const fct)(void *arg1, void *arg2, void *arg3);
+	void *arg1;
+	void *arg2;
+	void *arg3;
+#if defined(USE_OBSOLETE_LINKER)
+	void *next;
+#endif
+};
+
+
+#if !defined(USE_OBSOLETE_LINKER)
+
+#define HA_INIT_SECTION(s) HA_SECTION("i_" # s)
+
+/* Declare a static variable in the init section dedicated to stage <stg>,
+ * with an element referencing function <function> and arguments <a1..a3>.
+ * <linenum> is needed to deduplicate entries created from a same file. The
+ * trick with (stg<STG_SIZE) consists in verifying that stg is a valid enum
+ * value from the initcall set, and to emit a warning or error if it is not.
+ * The function's type is cast so that it is technically possible to call a
+ * function taking other argument types, provided they are all the same size
+ * as a pointer (args are cast to (void*)). Do not use this macro directly,
+ * use INITCALL{0..3}() instead.
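+ *
+ * For example, the REGISTER_* convenience macros in haproxy/init.h expand to
+ * calls of this form (sketch; my_post_check is a hypothetical callback):
+ *
+ *	INITCALL1(STG_REGISTER, hap_register_post_check, (my_post_check));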
+ */
+#define __DECLARE_INITCALL(stg, linenum, function, a1, a2, a3)	\
+	HA_GLOBL(__start_i_##stg );				\
+	HA_GLOBL(__stop_i_##stg );				\
+	static const struct initcall *__initcb_##linenum	\
+	    __attribute__((__used__)) HA_INIT_SECTION(stg) =	\
+		(stg < STG_SIZE) ? &(const struct initcall) {	\
+			.fct = (void (*)(void *,void *,void *))function, \
+			.arg1 = (void *)(a1),			\
+			.arg2 = (void *)(a2),			\
+			.arg3 = (void *)(a3),			\
+		} : NULL
+
+
+#else // USE_OBSOLETE_LINKER
+
+/* Declare a static constructor function to register a static descriptor for
+ * stage <stg>, with an element referencing function <function> and arguments
+ * <a1..a3>. <linenum> is needed to deduplicate entries created from a same
+ * file. The trick with (stg<STG_SIZE) consists in verifying that stg is a
+ * valid enum value from the initcall set, and to emit a warning or error if
+ * it is not.
+ * The function's type is cast so that it is technically possible to call a
+ * function taking other argument types, provided they are all the same size
+ * as a pointer (args are cast to (void*)). Do not use this macro directly,
+ * use INITCALL{0..3}() instead.
+ */
+#define __DECLARE_INITCALL(stg, linenum, function, a1, a2, a3)	\
+__attribute__((constructor)) static void __initcb_##linenum()	\
+{								\
+	static struct initcall entry = {			\
+		.fct = (void (*)(void *,void *,void *))function, \
+		.arg1 = (void *)(a1),				\
+		.arg2 = (void *)(a2),				\
+		.arg3 = (void *)(a3),				\
+	};							\
+	if (stg < STG_SIZE) {					\
+		entry.next = __initstg[stg];			\
+		__initstg[stg] = &entry;			\
+	};							\
+}
+
+#endif // USE_OBSOLETE_LINKER
+
+/* This is used to resolve <linenum> to an integer before calling
+ * __DECLARE_INITCALL(). Do not use this macro directly, use INITCALL{0..3}()
+ * instead.
+ */
+#define _DECLARE_INITCALL(...) \
+	__DECLARE_INITCALL(__VA_ARGS__)
+
+/* This requires that function <function> is called without any argument
+ * during init stage <stage> which must be one of init_stage.
+ */
+#define INITCALL0(stage, function) \
+	_DECLARE_INITCALL(stage, __LINE__, function, 0, 0, 0)
+
+/* This requires that function <function> is called with pointer argument
+ * <arg1> during init stage <stage> which must be one of init_stage.
+ */
+#define INITCALL1(stage, function, arg1) \
+	_DECLARE_INITCALL(stage, __LINE__, function, arg1, 0, 0)
+
+/* This requires that function <function> is called with pointer arguments
+ * <arg1..2> during init stage <stage> which must be one of init_stage.
+ */
+#define INITCALL2(stage, function, arg1, arg2) \
+	_DECLARE_INITCALL(stage, __LINE__, function, arg1, arg2, 0)
+
+/* This requires that function <function> is called with pointer arguments
+ * <arg1..3> during init stage <stage> which must be one of init_stage.
+ */
+#define INITCALL3(stage, function, arg1, arg2, arg3) \
+	_DECLARE_INITCALL(stage, __LINE__, function, arg1, arg2, arg3)
+
+#if !defined(USE_OBSOLETE_LINKER)
+/* Iterate pointer p (of type initcall**) over all registered calls at
+ * stage <stg>.
+ */
+#define FOREACH_INITCALL(p,stg)					\
+	for ((p) = &(__start_i_##stg); (p) < &(__stop_i_##stg); (p)++)
+
+#else // USE_OBSOLETE_LINKER
+
+#define FOREACH_INITCALL(p,stg)					\
+	for ((p) = __initstg[stg]; (p); (p) = (p)->next)
+#endif // USE_OBSOLETE_LINKER
+
+
+#if !defined(USE_OBSOLETE_LINKER)
+/* Declare a section for stage <stg>. The start and stop pointers are set by
+ * the linker itself, which is why they're declared extern here. The weak
+ * attribute is used so that we declare them ourselves if the section is
+ * empty.
The corresponding sections must contain exclusively pointers to + * make sure each location may safely be visited by incrementing a pointer. + */ +#define DECLARE_INIT_SECTION(stg) \ + extern __attribute__((__weak__)) const struct initcall *__start_i_##stg HA_SECTION_START("i_" # stg); \ + extern __attribute__((__weak__)) const struct initcall *__stop_i_##stg HA_SECTION_STOP("i_" # stg) + +/* Declare all initcall sections here */ +DECLARE_INIT_SECTION(STG_PREPARE); +DECLARE_INIT_SECTION(STG_LOCK); +DECLARE_INIT_SECTION(STG_REGISTER); +DECLARE_INIT_SECTION(STG_ALLOC); +DECLARE_INIT_SECTION(STG_POOL); +DECLARE_INIT_SECTION(STG_INIT); + +// for use in the main haproxy.c file +#define DECLARE_INIT_STAGES asm("") + +/* not needed anymore */ +#undef DECLARE_INIT_SECTION + +#else // USE_OBSOLETE_LINKER + +extern struct initcall *__initstg[STG_SIZE]; + +// for use in the main haproxy.c file +#define DECLARE_INIT_STAGES struct initcall *__initstg[STG_SIZE] + +#endif // USE_OBSOLETE_LINKER + +#if !defined(USE_OBSOLETE_LINKER) +/* Run the initcalls for stage <stg>. The test on <stg> is only there to + * ensure it is a valid initcall stage. + */ +#define RUN_INITCALLS(stg) \ + do { \ + const struct initcall **ptr; \ + if (stg >= STG_SIZE) \ + break; \ + FOREACH_INITCALL(ptr, stg) \ + (*ptr)->fct((*ptr)->arg1, (*ptr)->arg2, (*ptr)->arg3); \ + } while (0) + +#else // USE_OBSOLETE_LINKER + +/* Run the initcalls for stage <stg>. The test on <stg> is only there to + * ensure it is a valid initcall stage. + */ +#define RUN_INITCALLS(stg) \ + do { \ + const struct initcall *ptr; \ + if (stg >= STG_SIZE) \ + break; \ + FOREACH_INITCALL(ptr, stg) \ + (ptr)->fct((ptr)->arg1, (ptr)->arg2, (ptr)->arg3); \ + } while (0) + +#endif // USE_OBSOLETE_LINKER + +#endif /* _HAPROXY_INITCALL_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/intops.h b/include/haproxy/intops.h new file mode 100644 index 0000000..34010cc --- /dev/null +++ b/include/haproxy/intops.h @@ -0,0 +1,495 @@ +/* + * include/haproxy/intops.h + * Functions for integer operations. + * + * Copyright (C) 2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#ifndef _HAPROXY_INTOPS_H
+#define _HAPROXY_INTOPS_H
+
+#include <haproxy/api.h>
+
+/* exported functions, mostly integer parsing */
+/* rounds <i> down to the closest value having max 2 digits */
+unsigned int round_2dig(unsigned int i);
+unsigned int full_hash(unsigned int a);
+int varint_bytes(uint64_t v);
+unsigned int read_uint(const char **s, const char *end);
+long long read_int64(const char **s, const char *end);
+unsigned long long read_uint64(const char **s, const char *end);
+unsigned int str2ui(const char *s);
+unsigned int str2uic(const char *s);
+unsigned int strl2ui(const char *s, int len);
+unsigned int strl2uic(const char *s, int len);
+int strl2ic(const char *s, int len);
+int strl2irc(const char *s, int len, int *ret);
+int strl2llrc(const char *s, int len, long long *ret);
+int strl2llrc_dotted(const char *text, int len, long long *ret);
+unsigned int mask_find_rank_bit(unsigned int r, unsigned long m);
+unsigned int mask_find_rank_bit_fast(unsigned int r, unsigned long m,
+                                     unsigned long a, unsigned long b,
+                                     unsigned long c, unsigned long d);
+void mask_prep_rank_map(unsigned long m,
+                        unsigned long *a, unsigned long *b,
+                        unsigned long *c, unsigned long *d);
+int one_among_mask(unsigned long v, int bit);
+
+
+/* Multiply the two 32-bit operands and shift the 64-bit result right 32 bits.
+ * This is used to compute fixed ratios by setting one of the operands to
+ * (2^32*ratio).
+ */
+static inline unsigned int mul32hi(unsigned int a, unsigned int b)
+{
+	return ((unsigned long long)a * b + a) >> 32;
+}
+
+/* gcc does not know when it can safely divide 64 bits by 32 bits. Use this
+ * function when you know for sure that the result fits in 32 bits, because
+ * it is optimal on x86 and on 64bit processors.
+ */
+static inline unsigned int div64_32(unsigned long long o1, unsigned int o2)
+{
+	unsigned long long result;
+#ifdef __i386__
+	asm("divl %2"
+	    : "=A" (result)
+	    : "A"(o1), "rm"(o2));
+#else
+	result = o1 / o2;
+#endif
+	return result;
+}
+
+/* rotate left a 64-bit integer by <bits:[0-63]> bits */
+static inline uint64_t rotl64(uint64_t v, uint8_t bits)
+{
+#if !defined(__ARM_ARCH_8A) && !defined(__x86_64__)
+	bits &= 63;
+#endif
+	v = (v << bits) | (v >> (-bits & 63));
+	return v;
+}
+
+/* rotate right a 64-bit integer by <bits:[0-63]> bits */
+static inline uint64_t rotr64(uint64_t v, uint8_t bits)
+{
+#if !defined(__ARM_ARCH_8A) && !defined(__x86_64__)
+	bits &= 63;
+#endif
+	v = (v >> bits) | (v << (-bits & 63));
+	return v;
}

/* Simple popcountl implementation. It returns the number of ones in a word.
 * Described here : https://graphics.stanford.edu/~seander/bithacks.html
 */
+static inline unsigned int my_popcountl(unsigned long a)
+{
+	a = a - ((a >> 1) & ~0UL/3);
+	a = (a & ~0UL/15*3) + ((a >> 2) & ~0UL/15*3);
+	a = (a + (a >> 4)) & ~0UL/255*15;
+	return (unsigned long)(a * (~0UL/255)) >> (sizeof(unsigned long) - 1) * 8;
+}
+
+/* returns non-zero if <a> has at least 2 bits set */
+static inline unsigned long atleast2(unsigned long a)
+{
+	return a & (a - 1);
+}
+
+/* Simple ffs implementation. It returns the position of the lowest bit set to
+ * one, starting at 1. It is illegal to call it with a==0 (undefined result).
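+ * For example, my_ffsl(0x1) returns 1, and my_ffsl(0x8000) returns 16
+ * (bit 15, counted from 1).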
+ */ +static inline unsigned int my_ffsl(unsigned long a) +{ + unsigned long cnt; + +#if defined(__x86_64__) + __asm__("bsf %1,%0\n" : "=r" (cnt) : "rm" (a)); + cnt++; +#else + + cnt = 1; +#if LONG_MAX > 0x7FFFFFFFL /* 64bits */ + if (!(a & 0xFFFFFFFFUL)) { + a >>= 32; + cnt += 32; + } +#endif + if (!(a & 0XFFFFU)) { + a >>= 16; + cnt += 16; + } + if (!(a & 0XFF)) { + a >>= 8; + cnt += 8; + } + if (!(a & 0xf)) { + a >>= 4; + cnt += 4; + } + if (!(a & 0x3)) { + a >>= 2; + cnt += 2; + } + if (!(a & 0x1)) { + cnt += 1; + } +#endif /* x86_64 */ + + return cnt; +} + +/* Simple fls implementation. It returns the position of the highest bit set to + * one, starting at 1. It is illegal to call it with a==0 (undefined result). + */ +static inline unsigned int my_flsl(unsigned long a) +{ + unsigned long cnt; + +#if defined(__x86_64__) + __asm__("bsr %1,%0\n" : "=r" (cnt) : "rm" (a)); + cnt++; +#else + + cnt = 1; +#if LONG_MAX > 0x7FFFFFFFUL /* 64bits */ + if (a & 0xFFFFFFFF00000000UL) { + a >>= 32; + cnt += 32; + } +#endif + if (a & 0XFFFF0000U) { + a >>= 16; + cnt += 16; + } + if (a & 0XFF00) { + a >>= 8; + cnt += 8; + } + if (a & 0xf0) { + a >>= 4; + cnt += 4; + } + if (a & 0xc) { + a >>= 2; + cnt += 2; + } + if (a & 0x2) { + cnt += 1; + } +#endif /* x86_64 */ + + return cnt; +} + +/* Build a word with the <bits> lower bits set (reverse of my_popcountl) */ +static inline unsigned long nbits(int bits) +{ + if (--bits < 0) + return 0; + else + return (2UL << bits) - 1; +} + +/* Turns 64-bit value <a> from host byte order to network byte order. + * The principle consists in letting the compiler detect we're playing + * with a union and simplify most or all operations. The asm-optimized + * htonl() version involving bswap (x86) / rev (arm) / other is a single + * operation on little endian, or a NOP on big-endian. In both cases, + * this lets the compiler "see" that we're rebuilding a 64-bit word from + * two 32-bit quantities that fit into a 32-bit register. In big endian, + * the whole code is optimized out. In little endian, with a decent compiler, + * a few bswap and 2 shifts are left, which is the minimum acceptable. + */ +static inline unsigned long long my_htonll(unsigned long long a) +{ +#if defined(__x86_64__) + __asm__ volatile("bswapq %0" : "=r"(a) : "0"(a)); + return a; +#else + union { + struct { + unsigned int w1; + unsigned int w2; + } by32; + unsigned long long by64; + } w = { .by64 = a }; + return ((unsigned long long)htonl(w.by32.w1) << 32) | htonl(w.by32.w2); +#endif +} + +/* Turns 64-bit value <a> from network byte order to host byte order. 
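+ * On a little-endian host this swaps the byte order, so for example
+ * my_ntohll(0x0102030405060708ULL) yields 0x0807060504030201ULL; on a
+ * big-endian host it is a no-op.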
+ */
+static inline unsigned long long my_ntohll(unsigned long long a)
+{
+	return my_htonll(a);
+}
+
+/* sets bit <bit> into map <map>, which must be long-aligned */
+static inline void ha_bit_set(unsigned long bit, long *map)
+{
+	map[bit / (8 * sizeof(*map))] |= 1UL << (bit & (8 * sizeof(*map) - 1));
+}
+
+/* clears bit <bit> from map <map>, which must be long-aligned */
+static inline void ha_bit_clr(unsigned long bit, long *map)
+{
+	map[bit / (8 * sizeof(*map))] &= ~(1UL << (bit & (8 * sizeof(*map) - 1)));
+}
+
+/* flips bit <bit> from map <map>, which must be long-aligned */
+static inline void ha_bit_flip(unsigned long bit, long *map)
+{
+	map[bit / (8 * sizeof(*map))] ^= 1UL << (bit & (8 * sizeof(*map) - 1));
+}
+
+/* returns non-zero if bit <bit> from map <map> is set, otherwise 0 */
+static inline int ha_bit_test(unsigned long bit, const long *map)
+{
+	return !!(map[bit / (8 * sizeof(*map))] & 1UL << (bit & (8 * sizeof(*map) - 1)));
+}
+
+/* hash a 32-bit integer to another 32-bit integer. This code may be large when
+ * inlined, use full_hash() instead.
+ */
+static inline unsigned int __full_hash(unsigned int a)
+{
+	/* This function is one of Bob Jenkins' full avalanche hashing
+	 * functions, which provides quite a good distribution for small
+	 * input variations. The result is well suited to spreading over a
+	 * 32-bit space with enough variation so that a randomly picked number
+	 * falls equally before any server position.
+	 * Check http://burtleburtle.net/bob/hash/integer.html for more info.
+	 */
+	a = (a+0x7ed55d16) + (a<<12);
+	a = (a^0xc761c23c) ^ (a>>19);
+	a = (a+0x165667b1) + (a<<5);
+	a = (a+0xd3a2646c) ^ (a<<9);
+	a = (a+0xfd7046c5) + (a<<3);
+	a = (a^0xb55a4f09) ^ (a>>16);
+
+	/* ensure values are better spread all around the tree by multiplying
+	 * by a large prime close to 3/4 of the tree.
+	 */
+	return a * 3221225473U;
+}
+
+/*
+ * Return integer equivalent of character <c> for a hex digit (0-9, a-f, A-F),
+ * otherwise -1. This compact form helps gcc produce efficient code.
+ */
+static inline int hex2i(int c)
+{
+	if ((unsigned char)(c -= '0') > 9) {
+		if ((unsigned char)(c -= 'A' - '0') > 5 &&
+		    (unsigned char)(c -= 'a' - 'A') > 5)
+			c = -11;
+		c += 10;
+	}
+	return c;
+}
+
+/* This one is 6 times faster than strtoul() on athlon, but does
+ * no check at all.
+ */
+static inline unsigned int __str2ui(const char *s)
+{
+	unsigned int i = 0;
+	while (*s) {
+		i = i * 10 - '0';
+		i += (unsigned char)*s++;
+	}
+	return i;
+}
+
+/* This one is 5 times faster than strtoul() on athlon with checks.
+ * It returns the value of the number composed of all valid digits read.
+ */
+static inline unsigned int __str2uic(const char *s)
+{
+	unsigned int i = 0;
+	unsigned int j;
+
+	while (1) {
+		j = (*s++) - '0';
+		if (j > 9)
+			break;
+		i *= 10;
+		i += j;
+	}
+	return i;
+}
+
+/* This one is 28 times faster than strtoul() on athlon, but does
+ * no check at all!
+ */
+static inline unsigned int __strl2ui(const char *s, int len)
+{
+	unsigned int i = 0;
+
+	while (len-- > 0) {
+		i = i * 10 - '0';
+		i += (unsigned char)*s++;
+	}
+	return i;
+}
+
+/* This one is 7 times faster than strtoul() on athlon with checks.
+ * It returns the value of the number composed of all valid digits read.
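+ * For example, __strl2uic("123;", 4) stops on the ';' and returns 123.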
+ */
+static inline unsigned int __strl2uic(const char *s, int len)
+{
+	unsigned int i = 0;
+	unsigned int j, k;
+
+	while (len-- > 0) {
+		j = (*s++) - '0';
+		k = i * 10;
+		if (j > 9)
+			break;
+		i = k + j;
+	}
+	return i;
+}
+
+/* This function reads an unsigned integer from the string pointed to by <s>
+ * and returns it. The <s> pointer is adjusted to point to the first unread
+ * char. The function automatically stops at <end>.
+ */
+static inline unsigned int __read_uint(const char **s, const char *end)
+{
+	const char *ptr = *s;
+	unsigned int i = 0;
+	unsigned int j, k;
+
+	while (ptr < end) {
+		j = *ptr - '0';
+		k = i * 10;
+		if (j > 9)
+			break;
+		i = k + j;
+		ptr++;
+	}
+	*s = ptr;
+	return i;
+}
+
+/* returns the number of bytes needed to encode <v> as a varint. Be careful, use
+ * it only with constants as it generates large code (typically ~180 bytes). Use
+ * the varint_bytes() version instead in case of doubt.
+ */
+static inline int __varint_bytes(uint64_t v)
+{
+	switch (v) {
+	case 0x0000000000000000ULL ... 0x00000000000000efULL: return 1;
+	case 0x00000000000000f0ULL ... 0x00000000000008efULL: return 2;
+	case 0x00000000000008f0ULL ... 0x00000000000408efULL: return 3;
+	case 0x00000000000408f0ULL ... 0x00000000020408efULL: return 4;
+	case 0x00000000020408f0ULL ... 0x00000001020408efULL: return 5;
+	case 0x00000001020408f0ULL ... 0x00000081020408efULL: return 6;
+	case 0x00000081020408f0ULL ... 0x00004081020408efULL: return 7;
+	case 0x00004081020408f0ULL ... 0x00204081020408efULL: return 8;
+	case 0x00204081020408f0ULL ... 0x10204081020408efULL: return 9;
+	default: return 10;
+	}
+}
+
+/* Encode the integer <i> into a varint (variable-length integer). The encoded
+ * value is copied in <*buf>. Here is the encoding format:
+ *
+ *        0 <= X < 240        : 1 byte  (7.875 bits)  [ XXXX XXXX ]
+ *      240 <= X < 2288       : 2 bytes (11 bits)     [ 1111 XXXX ] [ 0XXX XXXX ]
+ *     2288 <= X < 264432     : 3 bytes (18 bits)     [ 1111 XXXX ] [ 1XXX XXXX ]   [ 0XXX XXXX ]
+ *   264432 <= X < 33818864   : 4 bytes (25 bits)     [ 1111 XXXX ] [ 1XXX XXXX ]*2 [ 0XXX XXXX ]
+ * 33818864 <= X < 4328786160 : 5 bytes (32 bits)     [ 1111 XXXX ] [ 1XXX XXXX ]*3 [ 0XXX XXXX ]
+ * ...
+ *
+ * On success, it returns the number of written bytes and <*buf> is moved after
+ * the encoded value. Otherwise, it returns -1.
+ */
+static inline int encode_varint(uint64_t i, char **buf, char *end)
+{
+	unsigned char *p = (unsigned char *)*buf;
+	int r;
+
+	if (p >= (unsigned char *)end)
+		return -1;
+
+	if (i < 240) {
+		*p++ = i;
+		*buf = (char *)p;
+		return 1;
+	}
+
+	*p++ = (unsigned char)i | 240;
+	i = (i - 240) >> 4;
+	while (i >= 128) {
+		if (p >= (unsigned char *)end)
+			return -1;
+		*p++ = (unsigned char)i | 128;
+		i = (i - 128) >> 7;
+	}
+
+	if (p >= (unsigned char *)end)
+		return -1;
+	*p++ = (unsigned char)i;
+
+	r = ((char *)p - *buf);
+	*buf = (char *)p;
+	return r;
+}
+
+/* Decode a varint from <*buf> and save the decoded value in <*i>. See
+ * encode_varint() above for details about the varint format.
+ * On success, it returns the number of read bytes and <*buf> is moved after the
+ * varint. Otherwise, it returns -1.
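+ *
+ * A round-trip sketch (illustration only; <buf>, <p> and <v> are assumed
+ * local variables):
+ *
+ *	char buf[10], *p = buf;
+ *	uint64_t v;
+ *
+ *	encode_varint(1000, &p, buf + sizeof(buf));  // writes 2 bytes
+ *	p = buf;
+ *	decode_varint(&p, buf + sizeof(buf), &v);    // v == 1000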
*/ +static inline int decode_varint(char **buf, char *end, uint64_t *i) +{ + unsigned char *p = (unsigned char *)*buf; + int r; + + if (p >= (unsigned char *)end) + return -1; + + *i = *p++; + if (*i < 240) { + *buf = (char *)p; + return 1; + } + + r = 4; + do { + if (p >= (unsigned char *)end) + return -1; + *i += (uint64_t)*p << r; + r += 7; + } while (*p++ >= 128); + + r = ((char *)p - *buf); + *buf = (char *)p; + return r; +} + +#endif /* _HAPROXY_INTOPS_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/istbuf.h b/include/haproxy/istbuf.h new file mode 100644 index 0000000..392ec46 --- /dev/null +++ b/include/haproxy/istbuf.h @@ -0,0 +1,162 @@ +/* + * include/haproxy/istbuf.h + * Functions used to manipulate indirect strings with wrapping buffers. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _HAPROXY_ISTBUF_H +#define _HAPROXY_ISTBUF_H + +#include <sys/types.h> +#include <import/ist.h> +#include <haproxy/buf.h> + + +/* b_isteq() : returns > 0 if the first <n> characters of buffer <b> starting + * at offset <o> relative to the buffer's head match <ist>. (empty strings do + * match). It is designed to be used with reasonably small strings (it matches + * a single byte per loop iteration). It is expected to be used with an offset + * to skip old data. 
For example : + * - "input" contents : b_isteq(b, old_cnt, new_cnt, ist); + * - "output" contents : b_isteq(b, 0, old_cnt, ist); + * Return value : + * >0 : the number of matching bytes + * =0 : not enough bytes (or matching of empty string) + * <0 : non-matching byte found + */ +static inline ssize_t b_isteq(const struct buffer *b, size_t o, size_t n, const struct ist ist) +{ + struct ist r = ist; + const char *p; + const char *end = b_wrap(b); + + if (n < r.len) + return 0; + + p = b_peek(b, o); + while (r.len--) { + if (*p++ != *r.ptr++) + return -1; + if (unlikely(p == end)) + p = b_orig(b); + } + return ist.len; +} + +/* Same as b_isteq but case-insensitive */ +static inline ssize_t b_isteqi(const struct buffer *b, size_t o, size_t n, const struct ist ist) +{ + struct ist r = ist; + const char *p; + const char *end = b_wrap(b); + + if (n < r.len) + return 0; + + p = b_peek(b, o); + while (r.len--) { + if (*p != *r.ptr && + ist_lc[(unsigned char)*p] != ist_lc[(unsigned char)*r.ptr]) + return -1; + p++; + r.ptr++; + if (unlikely(p == end)) + p = b_orig(b); + } + return ist.len; +} + +/* b_isteat() : "eats" string <ist> from the head of buffer <b>. Wrapping data + * is explicitly supported. It matches a single byte per iteration so strings + * should remain reasonably small. Returns : + * > 0 : number of bytes matched and eaten + * = 0 : not enough bytes (or matching an empty string) + * < 0 : non-matching byte found + */ +static inline ssize_t b_isteat(struct buffer *b, const struct ist ist) +{ + ssize_t ret = b_isteq(b, 0, b_data(b), ist); + + if (ret > 0) + b_del(b, ret); + return ret; +} + +/* b_istput() : injects string <ist> at the tail of output buffer <b> provided + * that it fits. Wrapping is supported. It's designed for small strings as it + * only writes a single byte per iteration. Returns the number of characters + * copied (ist.len), 0 if it temporarily does not fit, or -1 if it will never + * fit. It will only modify the buffer upon success. In all cases, the contents + * are copied prior to reporting an error, so that the destination at least + * contains a valid but truncated string. + */ +static inline ssize_t b_istput(struct buffer *b, const struct ist ist) +{ + const char *end = b_wrap(b); + struct ist r = ist; + char *p; + + if (r.len > (size_t)b_room(b)) + return r.len < b->size ? 0 : -1; + + p = b_tail(b); + b->data += r.len; + while (r.len--) { + *p++ = *r.ptr++; + if (unlikely(p == end)) + p = b_orig(b); + } + return ist.len; +} + +/* b_putist() : tries to copy as much as possible of string <ist> into buffer + * <b> and returns the number of bytes copied (truncation is possible). It uses + * b_putblk() and is suitable for large blocks. + */ +static inline size_t b_putist(struct buffer *b, const struct ist ist) +{ + return b_putblk(b, ist.ptr, ist.len); +} + +/* builds and return a <struct buffer> based on <ist> + */ +static inline struct buffer ist2buf(const struct ist ist) +{ + struct buffer buf; + + buf.area = ist.ptr; + buf.size = ist.len; + buf.data = ist.len; + buf.head = 0; + return buf; +} + +#endif /* _HAPROXY_ISTBUF_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/jwt-t.h b/include/haproxy/jwt-t.h new file mode 100644 index 0000000..e94607e --- /dev/null +++ b/include/haproxy/jwt-t.h @@ -0,0 +1,86 @@ +/* + * include/haproxy/jwt-t.h + * Macros, variables and structures for JWT management. 
+ * + * Copyright (C) 2021 HAProxy Technologies, Remi Tricot-Le Breton <rlebreton@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_JWT_T_H +#define _HAPROXY_JWT_T_H + +#include <haproxy/openssl-compat.h> + +#ifdef USE_OPENSSL +enum jwt_alg { + JWT_ALG_DEFAULT, + JWS_ALG_NONE, + JWS_ALG_HS256, + JWS_ALG_HS384, + JWS_ALG_HS512, + JWS_ALG_RS256, + JWS_ALG_RS384, + JWS_ALG_RS512, + JWS_ALG_ES256, + JWS_ALG_ES384, + JWS_ALG_ES512, + JWS_ALG_PS256, + JWS_ALG_PS384, + JWS_ALG_PS512, +}; + +struct jwt_item { + char *start; + size_t length; +}; + +struct jwt_ctx { + enum jwt_alg alg; + struct jwt_item jose; + struct jwt_item claims; + struct jwt_item signature; + char *key; + unsigned int key_length; +}; + +enum jwt_elt { + JWT_ELT_JOSE = 0, + JWT_ELT_CLAIMS, + JWT_ELT_SIG, + JWT_ELT_MAX +}; + +struct jwt_cert_tree_entry { + EVP_PKEY *pkey; + struct ebmb_node node; + char path[VAR_ARRAY]; +}; + +enum jwt_vrfy_status { + JWT_VRFY_KO = 0, + JWT_VRFY_OK = 1, + + JWT_VRFY_UNKNOWN_ALG = -1, + JWT_VRFY_UNMANAGED_ALG = -2, + JWT_VRFY_INVALID_TOKEN = -3, + JWT_VRFY_OUT_OF_MEMORY = -4, + JWT_VRFY_UNKNOWN_CERT = -5 +}; + +#endif /* USE_OPENSSL */ + + +#endif /* _HAPROXY_JWT_T_H */ diff --git a/include/haproxy/jwt.h b/include/haproxy/jwt.h new file mode 100644 index 0000000..a343ffa --- /dev/null +++ b/include/haproxy/jwt.h @@ -0,0 +1,37 @@ +/* + * include/haproxy/jwt.h + * Functions for JSON Web Token (JWT) management. + * + * Copyright (C) 2021 HAProxy Technologies, Remi Tricot-Le Breton <rlebreton@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_JWT_H +#define _HAPROXY_JWT_H + +#include <haproxy/jwt-t.h> +#include <haproxy/buf-t.h> + +#ifdef USE_OPENSSL +enum jwt_alg jwt_parse_alg(const char *alg_str, unsigned int alg_len); +int jwt_tokenize(const struct buffer *jwt, struct jwt_item *items, unsigned int *item_num); +int jwt_tree_load_cert(char *path, int pathlen, char **err); + +enum jwt_vrfy_status jwt_verify(const struct buffer *token, const struct buffer *alg, + const struct buffer *key); +#endif /* USE_OPENSSL */ + +#endif /* _HAPROXY_JWT_H */ diff --git a/include/haproxy/lb_chash-t.h b/include/haproxy/lb_chash-t.h new file mode 100644 index 0000000..c437981 --- /dev/null +++ b/include/haproxy/lb_chash-t.h @@ -0,0 +1,40 @@ +/* + * include/haproxy/lb_chash-t.h + * Types for Consistent Hash LB algorithm. + * + * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LB_CHASH_T_H +#define _HAPROXY_LB_CHASH_T_H + +#include <import/ebtree-t.h> + +struct lb_chash { + struct eb_root act; /* weighted chash entries of active servers */ + struct eb_root bck; /* weighted chash entries of backup servers */ + struct eb32_node *last; /* last node found in case of round robin (or NULL) */ +}; + +#endif /* _HAPROXY_LB_CHASH_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/lb_chash.h b/include/haproxy/lb_chash.h new file mode 100644 index 0000000..7950457 --- /dev/null +++ b/include/haproxy/lb_chash.h @@ -0,0 +1,41 @@ +/* + * include/haproxy/lb_chash.h + * Function declarations for Consistent Hash LB algorithm. + * + * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LB_CHASH_H +#define _HAPROXY_LB_CHASH_H + +#include <haproxy/api.h> +#include <haproxy/lb_chash-t.h> + +struct proxy; +struct server; +int chash_init_server_tree(struct proxy *p); +struct server *chash_get_next_server(struct proxy *p, struct server *srvtoavoid); +struct server *chash_get_server_hash(struct proxy *p, unsigned int hash, const struct server *avoid); + +#endif /* _HAPROXY_LB_CHASH_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/lb_fas-t.h b/include/haproxy/lb_fas-t.h new file mode 100644 index 0000000..cfb274c --- /dev/null +++ b/include/haproxy/lb_fas-t.h @@ -0,0 +1,39 @@ +/* + * include/types/lb_fas-t.h + * Types for First Available Server load balancing algorithm. + * + * Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LB_FAS_T_H +#define _HAPROXY_LB_FAS_T_H + +#include <import/ebtree-t.h> + +struct lb_fas { + struct eb_root act; /* weighted least conns on the active servers */ + struct eb_root bck; /* weighted least conns on the backup servers */ +}; + +#endif /* _HAPROXY_LB_FAS_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/lb_fas.h b/include/haproxy/lb_fas.h new file mode 100644 index 0000000..b12831c --- /dev/null +++ b/include/haproxy/lb_fas.h @@ -0,0 +1,40 @@ +/* + * include/haproxy/lb_fas.h + * First Available Server load balancing algorithm. + * + * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LB_FAS_H +#define _HAPROXY_LB_FAS_H + +#include <haproxy/api.h> +#include <haproxy/lb_fas-t.h> +#include <haproxy/proxy-t.h> +#include <haproxy/server-t.h> + +struct server *fas_get_next_server(struct proxy *p, struct server *srvtoavoid); +void fas_init_server_tree(struct proxy *p); + +#endif /* _HAPROXY_LB_FAS_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/lb_fwlc-t.h b/include/haproxy/lb_fwlc-t.h new file mode 100644 index 0000000..258a6ab --- /dev/null +++ b/include/haproxy/lb_fwlc-t.h @@ -0,0 +1,39 @@ +/* + * include/haproxy/lb_fwlc-t.h + * Types for Fast Weighted Least Connection load balancing algorithm. + * + * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LB_FWLC_T_H +#define _HAPROXY_LB_FWLC_T_H + +#include <import/ebtree-t.h> + +struct lb_fwlc { + struct eb_root act; /* weighted least conns on the active servers */ + struct eb_root bck; /* weighted least conns on the backup servers */ +}; + +#endif /* _HAPROXY_LB_FWLC_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/lb_fwlc.h b/include/haproxy/lb_fwlc.h new file mode 100644 index 0000000..a598af9 --- /dev/null +++ b/include/haproxy/lb_fwlc.h @@ -0,0 +1,40 @@ +/* + * include/haproxy/lb_fwlc.h + * Fast Weighted Least Connection load balancing algorithm. + * + * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LB_FWLC_H +#define _HAPROXY_LB_FWLC_H + +#include <haproxy/api.h> +#include <haproxy/lb_fwlc-t.h> +#include <haproxy/proxy-t.h> +#include <haproxy/server-t.h> + +struct server *fwlc_get_next_server(struct proxy *p, struct server *srvtoavoid); +void fwlc_init_server_tree(struct proxy *p); + +#endif /* _HAPROXY_LB_FWLC_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/lb_fwrr-t.h b/include/haproxy/lb_fwrr-t.h new file mode 100644 index 0000000..f7b746e --- /dev/null +++ b/include/haproxy/lb_fwrr-t.h @@ -0,0 +1,50 @@ +/* + * include/haproxy/lb_fwrr-t.h + * Types for Fast Weighted Round Robin load balancing algorithm. + * + * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LB_FWRR_T_H +#define _HAPROXY_LB_FWRR_T_H + +#include <import/ebtree-t.h> + +/* This structure is used to apply fast weighted round robin on a server group */ +struct fwrr_group { + struct eb_root curr; /* tree for servers in "current" time range */ + struct eb_root t0, t1; /* "init" and "next" servers */ + struct eb_root *init; /* servers waiting to be placed */ + struct eb_root *next; /* servers to be placed at next run */ + int curr_pos; /* current position in the tree */ + int curr_weight; /* total weight of the current time range */ + int next_weight; /* total weight of the next time range */ +}; + +struct lb_fwrr { + struct fwrr_group act; /* weighted round robin on the active servers */ + struct fwrr_group bck; /* weighted round robin on the backup servers */ +}; + +#endif /* _HAPROXY_LB_FWRR_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/lb_fwrr.h b/include/haproxy/lb_fwrr.h new file mode 100644 index 0000000..27b0a94 --- /dev/null +++ b/include/haproxy/lb_fwrr.h @@ -0,0 +1,40 @@ +/* + * include/haproxy/lb_fwrr.h + * Fast Weighted Round Robin load balancing algorithm. + * + * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LB_FWRR_H +#define _HAPROXY_LB_FWRR_H + +#include <haproxy/api.h> +#include <haproxy/lb_fwrr-t.h> +#include <haproxy/proxy-t.h> +#include <haproxy/server-t.h> + +void fwrr_init_server_groups(struct proxy *p); +struct server *fwrr_get_next_server(struct proxy *p, struct server *srvtoavoid); + +#endif /* _HAPROXY_LB_FWRR_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/lb_map-t.h b/include/haproxy/lb_map-t.h new file mode 100644 index 0000000..6d1dd1a --- /dev/null +++ b/include/haproxy/lb_map-t.h @@ -0,0 +1,40 @@ +/* + * include/haproxy/lb_map-t.h + * Types for map-based load-balancing (RR and HASH) + * + * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LB_MAP_T_H +#define _HAPROXY_LB_MAP_T_H + +#include <haproxy/api-t.h> +#include <haproxy/server-t.h> + +struct lb_map { + struct server **srv; /* the server map used to apply weights */ + int rr_idx; /* next server to be elected in round robin mode */ +}; + +#endif /* _HAPROXY_LB_MAP_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/lb_map.h b/include/haproxy/lb_map.h new file mode 100644 index 0000000..ca483b2 --- /dev/null +++ b/include/haproxy/lb_map.h @@ -0,0 +1,41 @@ +/* + * include/haproxy/lb_map.h + * Map-based load-balancing (RR and HASH) + * + * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_LB_MAP_H
+#define _HAPROXY_LB_MAP_H
+
+#include <haproxy/api.h>
+#include <haproxy/proxy-t.h>
+#include <haproxy/server-t.h>
+
+void recalc_server_map(struct proxy *px);
+void init_server_map(struct proxy *p);
+struct server *map_get_server_rr(struct proxy *px, struct server *srvtoavoid);
+struct server *map_get_server_hash(struct proxy *px, unsigned int hash);
+
+#endif /* _HAPROXY_LB_MAP_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/linuxcap.h b/include/haproxy/linuxcap.h
new file mode 100644
index 0000000..9c337a4
--- /dev/null
+++ b/include/haproxy/linuxcap.h
@@ -0,0 +1,7 @@
+#ifndef _HAPROXY_LINUXCAP_H
+#define _HAPROXY_LINUXCAP_H
+
+int prepare_caps_for_setuid(int from_uid, int to_uid);
+int finalize_caps_after_setuid(int from_uid, int to_uid);
+
+#endif /* _HAPROXY_LINUXCAP_H */
diff --git a/include/haproxy/list-t.h b/include/haproxy/list-t.h
new file mode 100644
index 0000000..dd8493e
--- /dev/null
+++ b/include/haproxy/list-t.h
@@ -0,0 +1,73 @@
+/*
+ * include/haproxy/list-t.h
+ * Circular list manipulation types definitions
+ *
+ * Copyright (C) 2002-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_LIST_T_H
+#define _HAPROXY_LIST_T_H
+
+
+/* these are circular or bidirectional lists only. Each list pointer points to
+ * another list pointer in a structure, and not the structure itself. The
+ * pointer to the next element MUST be the first one so that the list is easily
+ * cast as a single linked list or pointer.
+ */
+struct list {
+	struct list *n;	/* next */
+	struct list *p;	/* prev */
+};
+
+/* This is similar to struct list, but we want to be sure the compiler will
+ * yell at you if you use macros for one when you're using the other. You have
+ * to explicitly cast if that's really what you want to do.
+ */
+struct mt_list {
+	struct mt_list *next;
+	struct mt_list *prev;
+};
+
+
+/* a back-ref is a pointer to a target list entry. It is used to detect when an
+ * element being deleted is currently being tracked by another user. The best
+ * example is a user dumping the session table. The table does not fit in the
+ * output buffer so we have to set a mark on a session and go on later. But if
+ * that marked session gets deleted, we don't want the user's pointer to go in
+ * the wild. So we can simply link this user's request to the list of this
+ * session's users, and put a pointer to the list element in <ref>, which will
+ * be used as the mark for the next iteration.
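+ *
+ * A dumper would typically do something like this (sketch only; <br> is the
+ * caller's bref and <strm> the item being marked, both names assumed here
+ * for illustration):
+ *
+ *	LIST_APPEND(&strm->back_refs, &br->users);  // register as a tracker
+ *	br->ref = strm->list.n;                     // mark the resume point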
+ */ +struct bref { + struct list users; + struct list *ref; /* pointer to the target's list entry */ +}; + +/* a word list is a generic list with a pointer to a string in each element. */ +struct wordlist { + struct list list; + char *s; +}; + +/* this is the same as above with an additional pointer to a condition. */ +struct cond_wordlist { + struct list list; + void *cond; + char *s; +}; + +#endif /* _HAPROXY_LIST_T_H */ diff --git a/include/haproxy/list.h b/include/haproxy/list.h new file mode 100644 index 0000000..368e6d7 --- /dev/null +++ b/include/haproxy/list.h @@ -0,0 +1,907 @@ +/* + * include/haproxy/list.h + * Circular list manipulation macros and functions. + * + * Copyright (C) 2002-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LIST_H +#define _HAPROXY_LIST_H + +#include <haproxy/api.h> +#include <haproxy/thread.h> + +/* First undefine some macros which happen to also be defined on OpenBSD, + * in sys/queue.h, used by sys/event.h + */ +#undef LIST_HEAD +#undef LIST_INIT +#undef LIST_NEXT + +/* ILH = Initialized List Head : used to prevent gcc from moving an empty + * list to BSS. Some older version tend to trim all the array and cause + * corruption. + */ +#define ILH { .n = (struct list *)1, .p = (struct list *)2 } + +#define LIST_HEAD(a) ((void *)(&(a))) + +#define LIST_INIT(l) ((l)->n = (l)->p = (l)) + +#define LIST_HEAD_INIT(l) { &l, &l } + +/* adds an element at the beginning of a list ; returns the element */ +#define LIST_INSERT(lh, el) ({ (el)->n = (lh)->n; (el)->n->p = (lh)->n = (el); (el)->p = (lh); (el); }) + +/* adds an element at the end of a list ; returns the element */ +#define LIST_APPEND(lh, el) ({ (el)->p = (lh)->p; (el)->p->n = (lh)->p = (el); (el)->n = (lh); (el); }) + +/* adds the contents of a list <old> at the beginning of another list <new>. The old list head remains untouched. */ +#define LIST_SPLICE(new, old) do { \ + if (!LIST_ISEMPTY(old)) { \ + (old)->p->n = (new)->n; (old)->n->p = (new); \ + (new)->n->p = (old)->p; (new)->n = (old)->n; \ + } \ + } while (0) + +/* adds the contents of a list whose first element is <old> and last one is + * <old->prev> at the end of another list <new>. The old list DOES NOT have + * any head here. 
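To make the back-ref idea concrete, here is a minimal sketch of the bookmark pattern described above (illustrative only; the users head on the tracked object is an assumption of this sketch, not a field mandated by list-t.h):

struct tracked {
	struct list list;   /* chaining in the dumped list */
	struct list users;  /* head of brefs watching this element (assumed) */
};

static void sketch_set_mark(struct bref *br, struct tracked *t)
{
	LIST_APPEND(&t->users, &br->users); /* register as a watcher */
	br->ref = &t->list;                 /* resume point for the next dump */
}

On deletion, the deleter would walk t->users and advance each watcher's ref to the next element, so no dangling mark survives.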
+ */ +#define LIST_SPLICE_END_DETACHED(new, old) do { \ + typeof(new) __t; \ + (new)->p->n = (old); \ + (old)->p->n = (new); \ + __t = (old)->p; \ + (old)->p = (new)->p; \ + (new)->p = __t; \ + } while (0) + +/* removes an element from a list and returns it */ +#if defined(DEBUG_LIST) +/* purposely corrupt the detached element to detect use-after-delete */ +#define LIST_DELETE(el) ({ typeof(el) __ret = (el); (el)->n->p = (el)->p; (el)->p->n = (el)->n; *(__ret) = (struct list)ILH; (__ret);}) +#else +#define LIST_DELETE(el) ({ typeof(el) __ret = (el); (el)->n->p = (el)->p; (el)->p->n = (el)->n; (__ret); }) +#endif + +/* removes an element from a list, initializes it and returns it. + * This is faster than LIST_DELETE+LIST_INIT as we avoid reloading the pointers. + */ +#define LIST_DEL_INIT(el) ({ \ + typeof(el) __ret = (el); \ + typeof(__ret->n) __n = __ret->n; \ + typeof(__ret->p) __p = __ret->p; \ + __n->p = __p; __p->n = __n; \ + __ret->n = __ret->p = __ret; \ + __ret; \ +}) + +/* returns a pointer of type <pt> to a structure containing a list head called + * <el> at address <lh>. Note that <lh> can be the result of a function or macro + * since it's used only once. + * Example: LIST_ELEM(cur_node->args.next, struct node *, args) + */ +#define LIST_ELEM(lh, pt, el) ((pt)(((const char *)(lh)) - ((size_t)&((pt)NULL)->el))) + +/* checks if the list head <lh> is empty or not */ +#define LIST_ISEMPTY(lh) ((lh)->n == (lh)) + +/* checks if the list element <el> was added to a list or not. This only + * works when detached elements are reinitialized (using LIST_DEL_INIT) + */ +#define LIST_INLIST(el) ((el)->n != (el)) + +/* atomically checks if the list element's next pointer points to anything + * different from itself, implying the element should be part of a list. This + * usually is similar to LIST_INLIST() except that while that one might be + * instrumented using debugging code to perform further consistency checks, + * the macro below guarantees to always perform a single atomic test and is + * safe to use with barriers. + */ +#define LIST_INLIST_ATOMIC(el) ({ \ + typeof(el) __ptr = (el); \ + HA_ATOMIC_LOAD(&(__ptr)->n) != __ptr; \ +}) + +/* returns a pointer of type <pt> to a structure following the element + * which contains list head <lh>, which is known as element <el> in + * struct pt. + * Example: LIST_NEXT(args, struct node *, list) + */ +#define LIST_NEXT(lh, pt, el) (LIST_ELEM((lh)->n, pt, el)) + + +/* returns a pointer of type <pt> to a structure preceding the element + * which contains list head <lh>, which is known as element <el> in + * struct pt. + */ +#undef LIST_PREV +#define LIST_PREV(lh, pt, el) (LIST_ELEM((lh)->p, pt, el)) + +/* + * Simpler FOREACH_ITEM macro inspired from Linux sources. + * Iterates <item> through a list of items of type "typeof(*item)" which are + * linked via a "struct list" member named <member>. A pointer to the head of + * the list is passed in <list_head>. No temporary variable is needed. Note + * that <item> must not be modified during the loop. + * Example: list_for_each_entry(cur_acl, known_acl, list) { ... }; + */ +#define list_for_each_entry(item, list_head, member) \ + for (item = LIST_ELEM((list_head)->n, typeof(item), member); \ + &item->member != (list_head); \ + item = LIST_ELEM(item->member.n, typeof(item), member)) + +/* + * Same as list_for_each_entry but starting from current point + * Iterates <item> through the list starting from <item> + * It's basically the same macro but without initializing item to the head of + * the list. 
+ */ +#define list_for_each_entry_from(item, list_head, member) \ + for ( ; &item->member != (list_head); \ + item = LIST_ELEM(item->member.n, typeof(item), member)) + +/* + * Simpler FOREACH_ITEM_SAFE macro inspired from Linux sources. + * Iterates <item> through a list of items of type "typeof(*item)" which are + * linked via a "struct list" member named <member>. A pointer to the head of + * the list is passed in <list_head>. A temporary variable <back> of same type + * as <item> is needed so that <item> may safely be deleted if needed. + * Example: list_for_each_entry_safe(cur_acl, tmp, known_acl, list) { ... }; + */ +#define list_for_each_entry_safe(item, back, list_head, member) \ + for (item = LIST_ELEM((list_head)->n, typeof(item), member), \ + back = LIST_ELEM(item->member.n, typeof(item), member); \ + &item->member != (list_head); \ + item = back, back = LIST_ELEM(back->member.n, typeof(back), member)) + + +/* + * Same as list_for_each_entry_safe but starting from current point + * Iterates <item> through the list starting from <item> + * It's basically the same macro but without initializing item to the head of + * the list. + */ +#define list_for_each_entry_safe_from(item, back, list_head, member) \ + for (back = LIST_ELEM(item->member.n, typeof(item), member); \ + &item->member != (list_head); \ + item = back, back = LIST_ELEM(back->member.n, typeof(back), member)) + +/* + * Iterate backwards <item> through a list of items of type "typeof(*item)" + * which are linked via a "struct list" member named <member>. A pointer to + * the head of the list is passed in <list_head>. No temporary variable is + * needed. Note that <item> must not be modified during the loop. + * Example: list_for_each_entry_rev(cur_acl, known_acl, list) { ... }; + */ +#define list_for_each_entry_rev(item, list_head, member) \ + for (item = LIST_ELEM((list_head)->p, typeof(item), member); \ + &item->member != (list_head); \ + item = LIST_ELEM(item->member.p, typeof(item), member)) + +/* + * Same as list_for_each_entry_rev but starting from current point + * Iterate backwards <item> through the list starting from <item> + * It's basically the same macro but without initializing item to the head of + * the list. + */ +#define list_for_each_entry_from_rev(item, list_head, member) \ + for ( ; &item->member != (list_head); \ + item = LIST_ELEM(item->member.p, typeof(item), member)) + +/* + * Iterate backwards <item> through a list of items of type "typeof(*item)" + * which are linked via a "struct list" member named <member>. A pointer to + * the head of the list is passed in <list_head>. A temporary variable <back> + * of same type as <item> is needed so that <item> may safely be deleted + * if needed. + * Example: list_for_each_entry_safe_rev(cur_acl, tmp, known_acl, list) { ... }; + */ +#define list_for_each_entry_safe_rev(item, back, list_head, member) \ + for (item = LIST_ELEM((list_head)->p, typeof(item), member), \ + back = LIST_ELEM(item->member.p, typeof(item), member); \ + &item->member != (list_head); \ + item = back, back = LIST_ELEM(back->member.p, typeof(back), member)) + +/* + * Same as list_for_each_entry_safe_rev but starting from current point + * Iterate backwards <item> through the list starting from <item> + * It's basically the same macro but without initializing item to the head of + * the list. 
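The _safe variants exist precisely so that the current element may be unlinked mid-walk; a sketch (illustrative only, assuming <stdlib.h> for free()):

static void sketch_flush(struct list *head)
{
	struct wordlist *wl, *back;

	list_for_each_entry_safe(wl, back, head, list) {
		LIST_DELETE(&wl->list); /* legal here thanks to <back> */
		free(wl->s);
		free(wl);
	}
}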
+ */ +#define list_for_each_entry_safe_from_rev(item, back, list_head, member) \ + for (back = LIST_ELEM(item->member.p, typeof(item), member); \ + &item->member != (list_head); \ + item = back, back = LIST_ELEM(back->member.p, typeof(back), member)) + + +/* + * Locked version of list manipulation macros. + * It is OK to use those concurrently from multiple threads, as long as the + * list is only used with the locked variants. + */ +#define MT_LIST_BUSY ((struct mt_list *)1) + +/* + * Add an item at the beginning of a list. + * Returns 1 if we added the item, 0 otherwise (because it was already in a + * list). + */ +#define MT_LIST_TRY_INSERT(_lh, _el) \ + ({ \ + int _ret = 0; \ + struct mt_list *lh = (_lh), *el = (_el); \ + for (;;__ha_cpu_relax()) { \ + struct mt_list *n, *n2; \ + struct mt_list *p, *p2; \ + n = _HA_ATOMIC_XCHG(&(lh)->next, MT_LIST_BUSY); \ + if (n == MT_LIST_BUSY) \ + continue; \ + p = _HA_ATOMIC_XCHG(&n->prev, MT_LIST_BUSY); \ + if (p == MT_LIST_BUSY) { \ + (lh)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + n2 = _HA_ATOMIC_XCHG(&el->next, MT_LIST_BUSY); \ + if (n2 != el) { /* element already linked */ \ + if (n2 != MT_LIST_BUSY) \ + el->next = n2; \ + n->prev = p; \ + __ha_barrier_store(); \ + lh->next = n; \ + __ha_barrier_store(); \ + if (n2 == MT_LIST_BUSY) \ + continue; \ + break; \ + } \ + p2 = _HA_ATOMIC_XCHG(&el->prev, MT_LIST_BUSY); \ + if (p2 != el) { \ + if (p2 != MT_LIST_BUSY) \ + el->prev = p2; \ + n->prev = p; \ + el->next = el; \ + __ha_barrier_store(); \ + lh->next = n; \ + __ha_barrier_store(); \ + if (p2 == MT_LIST_BUSY) \ + continue; \ + break; \ + } \ + (el)->next = n; \ + (el)->prev = p; \ + __ha_barrier_store(); \ + n->prev = (el); \ + __ha_barrier_store(); \ + p->next = (el); \ + __ha_barrier_store(); \ + _ret = 1; \ + break; \ + } \ + (_ret); \ + }) + +/* + * Add an item at the end of a list. + * Returns 1 if we added the item, 0 otherwise (because it was already in a + * list). + */ +#define MT_LIST_TRY_APPEND(_lh, _el) \ + ({ \ + int _ret = 0; \ + struct mt_list *lh = (_lh), *el = (_el); \ + for (;;__ha_cpu_relax()) { \ + struct mt_list *n, *n2; \ + struct mt_list *p, *p2; \ + p = _HA_ATOMIC_XCHG(&(lh)->prev, MT_LIST_BUSY); \ + if (p == MT_LIST_BUSY) \ + continue; \ + n = _HA_ATOMIC_XCHG(&p->next, MT_LIST_BUSY); \ + if (n == MT_LIST_BUSY) { \ + (lh)->prev = p; \ + __ha_barrier_store(); \ + continue; \ + } \ + p2 = _HA_ATOMIC_XCHG(&el->prev, MT_LIST_BUSY); \ + if (p2 != el) { \ + if (p2 != MT_LIST_BUSY) \ + el->prev = p2; \ + p->next = n; \ + __ha_barrier_store(); \ + lh->prev = p; \ + __ha_barrier_store(); \ + if (p2 == MT_LIST_BUSY) \ + continue; \ + break; \ + } \ + n2 = _HA_ATOMIC_XCHG(&el->next, MT_LIST_BUSY); \ + if (n2 != el) { /* element already linked */ \ + if (n2 != MT_LIST_BUSY) \ + el->next = n2; \ + p->next = n; \ + el->prev = el; \ + __ha_barrier_store(); \ + lh->prev = p; \ + __ha_barrier_store(); \ + if (n2 == MT_LIST_BUSY) \ + continue; \ + break; \ + } \ + (el)->next = n; \ + (el)->prev = p; \ + __ha_barrier_store(); \ + p->next = (el); \ + __ha_barrier_store(); \ + n->prev = (el); \ + __ha_barrier_store(); \ + _ret = 1; \ + break; \ + } \ + (_ret); \ + }) + +/* + * Add an item at the beginning of a list. + * It is assumed the element can't already be in a list, so it isn't checked. 
+ */ +#define MT_LIST_INSERT(_lh, _el) \ + ({ \ + int _ret = 0; \ + struct mt_list *lh = (_lh), *el = (_el); \ + for (;;__ha_cpu_relax()) { \ + struct mt_list *n; \ + struct mt_list *p; \ + n = _HA_ATOMIC_XCHG(&(lh)->next, MT_LIST_BUSY); \ + if (n == MT_LIST_BUSY) \ + continue; \ + p = _HA_ATOMIC_XCHG(&n->prev, MT_LIST_BUSY); \ + if (p == MT_LIST_BUSY) { \ + (lh)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + (el)->next = n; \ + (el)->prev = p; \ + __ha_barrier_store(); \ + n->prev = (el); \ + __ha_barrier_store(); \ + p->next = (el); \ + __ha_barrier_store(); \ + _ret = 1; \ + break; \ + } \ + (_ret); \ + }) + +/* + * Add an item at the end of a list. + * It is assumed the element can't already be in a list, so it isn't checked + */ +#define MT_LIST_APPEND(_lh, _el) \ + ({ \ + int _ret = 0; \ + struct mt_list *lh = (_lh), *el = (_el); \ + for (;;__ha_cpu_relax()) { \ + struct mt_list *n; \ + struct mt_list *p; \ + p = _HA_ATOMIC_XCHG(&(lh)->prev, MT_LIST_BUSY); \ + if (p == MT_LIST_BUSY) \ + continue; \ + n = _HA_ATOMIC_XCHG(&p->next, MT_LIST_BUSY); \ + if (n == MT_LIST_BUSY) { \ + (lh)->prev = p; \ + __ha_barrier_store(); \ + continue; \ + } \ + (el)->next = n; \ + (el)->prev = p; \ + __ha_barrier_store(); \ + p->next = (el); \ + __ha_barrier_store(); \ + n->prev = (el); \ + __ha_barrier_store(); \ + _ret = 1; \ + break; \ + } \ + (_ret); \ + }) + +/* + * Add an item at the end of a list. + * It is assumed the element can't already be in a list, so it isn't checked + * Item will be added in busy/locked state, so that it is already + * referenced in the list but no other thread can use it until we're ready. + * + * This returns a struct mt_list, that will be needed at unlock time. + * (using MT_LIST_UNLOCK_ELT) + */ +#define MT_LIST_APPEND_LOCKED(_lh, _el) \ + ({ \ + struct mt_list np; \ + struct mt_list *lh = (_lh), *el = (_el); \ + (el)->next = MT_LIST_BUSY; \ + (el)->prev = MT_LIST_BUSY; \ + for (;;__ha_cpu_relax()) { \ + struct mt_list *n; \ + struct mt_list *p; \ + p = _HA_ATOMIC_XCHG(&(lh)->prev, MT_LIST_BUSY); \ + if (p == MT_LIST_BUSY) \ + continue; \ + n = _HA_ATOMIC_XCHG(&p->next, MT_LIST_BUSY); \ + if (n == MT_LIST_BUSY) { \ + (lh)->prev = p; \ + __ha_barrier_store(); \ + continue; \ + } \ + np.prev = p; \ + np.next = n; \ + break; \ + } \ + (np); \ + }) + +/* + * Detach a list from its head. A pointer to the first element is returned + * and the list is closed. If the list was empty, NULL is returned. This may + * exclusively be used with lists modified by MT_LIST_TRY_INSERT/MT_LIST_TRY_APPEND. This + * is incompatible with MT_LIST_DELETE run concurrently. + * If there's at least one element, the next of the last element will always + * be NULL. 
+ */ +#define MT_LIST_BEHEAD(_lh) ({ \ + struct mt_list *lh = (_lh); \ + struct mt_list *_n; \ + struct mt_list *_p; \ + for (;;__ha_cpu_relax()) { \ + _p = _HA_ATOMIC_XCHG(&(lh)->prev, MT_LIST_BUSY); \ + if (_p == MT_LIST_BUSY) \ + continue; \ + if (_p == (lh)) { \ + (lh)->prev = _p; \ + __ha_barrier_store(); \ + _n = NULL; \ + break; \ + } \ + _n = _HA_ATOMIC_XCHG(&(lh)->next, MT_LIST_BUSY); \ + if (_n == MT_LIST_BUSY) { \ + (lh)->prev = _p; \ + __ha_barrier_store(); \ + continue; \ + } \ + if (_n == (lh)) { \ + (lh)->next = _n; \ + (lh)->prev = _p; \ + __ha_barrier_store(); \ + _n = NULL; \ + break; \ + } \ + (lh)->next = (lh); \ + (lh)->prev = (lh); \ + __ha_barrier_store(); \ + _n->prev = _p; \ + __ha_barrier_store(); \ + _p->next = NULL; \ + __ha_barrier_store(); \ + break; \ + } \ + (_n); \ +}) + + +/* Remove an item from a list. + * Returns 1 if we removed the item, 0 otherwise (because it was in no list). + */ +#define MT_LIST_DELETE(_el) \ + ({ \ + int _ret = 0; \ + struct mt_list *el = (_el); \ + for (;;__ha_cpu_relax()) { \ + struct mt_list *n, *n2; \ + struct mt_list *p, *p2 = NULL; \ + n = _HA_ATOMIC_XCHG(&(el)->next, MT_LIST_BUSY); \ + if (n == MT_LIST_BUSY) \ + continue; \ + p = _HA_ATOMIC_XCHG(&(el)->prev, MT_LIST_BUSY); \ + if (p == MT_LIST_BUSY) { \ + (el)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + if (p != (el)) { \ + p2 = _HA_ATOMIC_XCHG(&p->next, MT_LIST_BUSY); \ + if (p2 == MT_LIST_BUSY) { \ + (el)->prev = p; \ + (el)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + } \ + if (n != (el)) { \ + n2 = _HA_ATOMIC_XCHG(&n->prev, MT_LIST_BUSY); \ + if (n2 == MT_LIST_BUSY) { \ + if (p2 != NULL) \ + p->next = p2; \ + (el)->prev = p; \ + (el)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + } \ + n->prev = p; \ + p->next = n; \ + if (p != (el) && n != (el)) \ + _ret = 1; \ + __ha_barrier_store(); \ + (el)->prev = (el); \ + (el)->next = (el); \ + __ha_barrier_store(); \ + break; \ + } \ + (_ret); \ + }) + + +/* Remove the first element from the list, and return it */ +#define MT_LIST_POP(_lh, pt, el) \ + ({ \ + void *_ret; \ + struct mt_list *lh = (_lh); \ + for (;;__ha_cpu_relax()) { \ + struct mt_list *n, *n2; \ + struct mt_list *p, *p2; \ + n = _HA_ATOMIC_XCHG(&(lh)->next, MT_LIST_BUSY); \ + if (n == MT_LIST_BUSY) \ + continue; \ + if (n == (lh)) { \ + (lh)->next = lh; \ + __ha_barrier_store(); \ + _ret = NULL; \ + break; \ + } \ + p = _HA_ATOMIC_XCHG(&n->prev, MT_LIST_BUSY); \ + if (p == MT_LIST_BUSY) { \ + (lh)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + n2 = _HA_ATOMIC_XCHG(&n->next, MT_LIST_BUSY); \ + if (n2 == MT_LIST_BUSY) { \ + n->prev = p; \ + __ha_barrier_store(); \ + (lh)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + p2 = _HA_ATOMIC_XCHG(&n2->prev, MT_LIST_BUSY); \ + if (p2 == MT_LIST_BUSY) { \ + n->next = n2; \ + n->prev = p; \ + __ha_barrier_store(); \ + (lh)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + (lh)->next = n2; \ + (n2)->prev = (lh); \ + __ha_barrier_store(); \ + (n)->prev = (n); \ + (n)->next = (n); \ + __ha_barrier_store(); \ + _ret = MT_LIST_ELEM(n, pt, el); \ + break; \ + } \ + (_ret); \ + }) + +#define MT_LIST_HEAD(a) ((void *)(&(a))) + +#define MT_LIST_INIT(l) ((l)->next = (l)->prev = (l)) + +#define MT_LIST_HEAD_INIT(l) { &l, &l } +/* returns a pointer of type <pt> to a structure containing a list head called + * <el> at address <lh>. Note that <lh> can be the result of a function or macro + * since it's used only once. 
+ * Example: MT_LIST_ELEM(cur_node->args.next, struct node *, args) + */ +#define MT_LIST_ELEM(lh, pt, el) ((pt)(((const char *)(lh)) - ((size_t)&((pt)NULL)->el))) + +/* checks if the list head <lh> is empty or not */ +#define MT_LIST_ISEMPTY(lh) ((lh)->next == (lh)) + +/* returns a pointer of type <pt> to a structure following the element + * which contains list head <lh>, which is known as element <el> in + * struct pt. + * Example: MT_LIST_NEXT(args, struct node *, list) + */ +#define MT_LIST_NEXT(lh, pt, el) (MT_LIST_ELEM((lh)->next, pt, el)) + + +/* returns a pointer of type <pt> to a structure preceding the element + * which contains list head <lh>, which is known as element <el> in + * struct pt. + */ +#undef MT_LIST_PREV +#define MT_LIST_PREV(lh, pt, el) (MT_LIST_ELEM((lh)->prev, pt, el)) + +/* checks if the list element <el> was added to a list or not. This only + * works when detached elements are reinitialized (using LIST_DEL_INIT) + */ +#define MT_LIST_INLIST(el) ((el)->next != (el)) + +/* Lock an element in the list, to be sure it won't be removed nor + * accessed by another thread while the lock is held. + * Locking behavior is inspired from MT_LIST_DELETE macro, + * thus this macro can safely be used concurrently with MT_LIST_DELETE. + * This returns a struct mt_list, that will be needed at unlock time. + * (using MT_LIST_UNLOCK_ELT) + */ +#define MT_LIST_LOCK_ELT(_el) \ + ({ \ + struct mt_list ret; \ + struct mt_list *el = (_el); \ + for (;;__ha_cpu_relax()) { \ + struct mt_list *n, *n2; \ + struct mt_list *p, *p2 = NULL; \ + n = _HA_ATOMIC_XCHG(&(el)->next, MT_LIST_BUSY); \ + if (n == MT_LIST_BUSY) \ + continue; \ + p = _HA_ATOMIC_XCHG(&(el)->prev, MT_LIST_BUSY); \ + if (p == MT_LIST_BUSY) { \ + (el)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + if (p != (el)) { \ + p2 = _HA_ATOMIC_XCHG(&p->next, MT_LIST_BUSY);\ + if (p2 == MT_LIST_BUSY) { \ + (el)->prev = p; \ + (el)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + } \ + if (n != (el)) { \ + n2 = _HA_ATOMIC_XCHG(&n->prev, MT_LIST_BUSY);\ + if (n2 == MT_LIST_BUSY) { \ + if (p2 != NULL) \ + p->next = p2; \ + (el)->prev = p; \ + (el)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + } \ + ret.next = n; \ + ret.prev = p; \ + break; \ + } \ + ret; \ + }) + +/* Unlock an element previously locked by MT_LIST_LOCK_ELT. "np" is the + * struct mt_list returned by MT_LIST_LOCK_ELT(). 
+ */ +#define MT_LIST_UNLOCK_ELT(_el, np) \ + do { \ + struct mt_list *n = (np).next, *p = (np).prev; \ + struct mt_list *el = (_el); \ + (el)->next = n; \ + (el)->prev = p; \ + if (n != (el)) \ + n->prev = (el); \ + if (p != (el)) \ + p->next = (el); \ + } while (0) + +/* Internal macros for the foreach macros */ +#define _MT_LIST_UNLOCK_NEXT(el, np) \ + do { \ + struct mt_list *n = (np); \ + (el)->next = n; \ + if (n != (el)) \ + n->prev = (el); \ + } while (0) + +/* Internal macros for the foreach macros */ +#define _MT_LIST_UNLOCK_PREV(el, np) \ + do { \ + struct mt_list *p = (np); \ + (el)->prev = p; \ + if (p != (el)) \ + p->next = (el); \ + } while (0) + +#define _MT_LIST_LOCK_NEXT(el) \ + ({ \ + struct mt_list *n = NULL; \ + for (;;__ha_cpu_relax()) { \ + struct mt_list *n2; \ + n = _HA_ATOMIC_XCHG(&((el)->next), MT_LIST_BUSY); \ + if (n == MT_LIST_BUSY) \ + continue; \ + if (n != (el)) { \ + n2 = _HA_ATOMIC_XCHG(&n->prev, MT_LIST_BUSY);\ + if (n2 == MT_LIST_BUSY) { \ + (el)->next = n; \ + __ha_barrier_store(); \ + continue; \ + } \ + } \ + break; \ + } \ + n; \ + }) + +#define _MT_LIST_LOCK_PREV(el) \ + ({ \ + struct mt_list *p = NULL; \ + for (;;__ha_cpu_relax()) { \ + struct mt_list *p2; \ + p = _HA_ATOMIC_XCHG(&((el)->prev), MT_LIST_BUSY); \ + if (p == MT_LIST_BUSY) \ + continue; \ + if (p != (el)) { \ + p2 = _HA_ATOMIC_XCHG(&p->next, MT_LIST_BUSY);\ + if (p2 == MT_LIST_BUSY) { \ + (el)->prev = p; \ + __ha_barrier_store(); \ + continue; \ + } \ + } \ + break; \ + } \ + p; \ + }) + +#define _MT_LIST_RELINK_DELETED(elt2) \ + do { \ + struct mt_list *n = elt2.next, *p = elt2.prev; \ + ALREADY_CHECKED(p); \ + n->prev = p; \ + p->next = n; \ + } while (0); + +/* Equivalent of MT_LIST_DELETE(), to be used when parsing the list with mt_list_for_each_entry_safe(). + * It should be the element currently parsed (tmpelt1) + */ +#define MT_LIST_DELETE_SAFE(_el) \ + do { \ + struct mt_list *el = (_el); \ + (el)->prev = (el); \ + (el)->next = (el); \ + (_el) = NULL; \ + } while (0) + +/* Same as MT_LIST_DELETE_SAFE, but it won't reinit the element */ +#define MT_LIST_DELETE_SAFE_NOINIT(_el) \ + do { \ + (_el) = NULL; \ + } while (0) + +/* Iterates <item> through a list of items of type "typeof(*item)" which are + * linked via a "struct mt_list" member named <member>. A pointer to the head + * of the list is passed in <list_head>. + * + * <tmpelt> is a temporary struct mt_list *, and <tmpelt2> is a temporary + * struct mt_list, used internally, both are needed for MT_LIST_DELETE_SAFE. + * + * This macro is implemented using a nested loop. The inner loop will run for + * each element in the list, and the upper loop will run only once to do some + * cleanup when the end of the list is reached or the user breaks from the + * inner loop. + * It's safe to break from this macro as the cleanup will be performed anyway, + * but it is strictly forbidden to goto from the loop because skipping the + * cleanup will lead to undefined behavior. + * + * In order to remove the current element, please use MT_LIST_DELETE_SAFE. + * + * Example: + * mt_list_for_each_entry_safe(item, list_head, list_member, elt1, elt2) { + * ...
+ * } + */ +#define mt_list_for_each_entry_safe(item, list_head, member, tmpelt, tmpelt2) \ + for ((tmpelt) = NULL; (tmpelt) != MT_LIST_BUSY; ({ \ + /* post loop cleanup: \ + * gets executed only once to perform cleanup \ + * after child loop has finished \ + */ \ + if (tmpelt) { \ + /* last elem still exists, unlocking it */ \ + if (tmpelt2.prev) \ + MT_LIST_UNLOCK_ELT(tmpelt, tmpelt2); \ + else { \ + /* special case: child loop did not run \ + * so tmpelt2.prev == NULL \ + * (empty list) \ + */ \ + _MT_LIST_UNLOCK_NEXT(tmpelt, tmpelt2.next); \ + } \ + } else { \ + /* last elem was deleted by user, relink required: \ + * prev->next = next \ + * next->prev = prev \ + */ \ + _MT_LIST_RELINK_DELETED(tmpelt2); \ + } \ + /* break parent loop \ + * (this loop runs exactly one time) \ + */ \ + (tmpelt) = MT_LIST_BUSY; \ + })) \ + for ((tmpelt) = (list_head), (tmpelt2).prev = NULL, (tmpelt2).next = _MT_LIST_LOCK_NEXT(tmpelt); ({ \ + /* this gets executed before each user body loop */ \ + (item) = MT_LIST_ELEM((tmpelt2.next), typeof(item), member); \ + if (&item->member != (list_head)) { \ + /* did not reach end of list \ + * (back to list_head == end of list reached) \ + */ \ + if (tmpelt2.prev != &item->member) \ + tmpelt2.next = _MT_LIST_LOCK_NEXT(&item->member); \ + else { \ + /* FIXME: is this even supposed to happen?? \ + * I'm not understanding how \ + * tmpelt2.prev could be equal to &item->member. \ + * running 'test_list' multiple times with 8 \ + * concurrent threads: this never gets reached \ + */ \ + tmpelt2.next = tmpelt; \ + } \ + if (tmpelt != NULL) { \ + /* if tmpelt was not deleted by user */ \ + if (tmpelt2.prev) { \ + /* not executed on first run \ + * (tmpelt2.prev == NULL on first run) \ + */ \ + _MT_LIST_UNLOCK_PREV(tmpelt, tmpelt2.prev); \ + /* unlock_prev will implicitly relink: \ + * elt->prev = prev \ + * prev->next = elt \ + */ \ + } \ + tmpelt2.prev = tmpelt; \ + } \ + (tmpelt) = &item->member; \ + } \ + /* else: end of list reached (loop stop cond) */ \ + }), \ + &item->member != (list_head);) + +static __inline struct list *mt_list_to_list(struct mt_list *list) +{ + union { + struct mt_list *mt_list; + struct list *list; + } mylist; + + mylist.mt_list = list; + return mylist.list; +} + +static __inline struct mt_list *list_to_mt_list(struct list *list) +{ + union { + struct mt_list *mt_list; + struct list *list; + } mylist; + + mylist.list = list; + return mylist.mt_list; + +} + +#endif /* _HAPROXY_LIST_H */ diff --git a/include/haproxy/listener-t.h b/include/haproxy/listener-t.h new file mode 100644 index 0000000..7f5e52a --- /dev/null +++ b/include/haproxy/listener-t.h @@ -0,0 +1,317 @@ +/* + * include/haproxy/listener-t.h + * This file defines the structures needed to manage listeners. + * + * Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
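Closing out list.h, a usage sketch for the iterator above, reusing the hypothetical job type from the earlier sketch and deleting every element (illustrative only):

static void sketch_mt_flush(struct mt_list *head)
{
	struct job *j;
	struct mt_list *elt1;
	struct mt_list elt2;

	mt_list_for_each_entry_safe(j, head, list, elt1, elt2) {
		MT_LIST_DELETE_SAFE(elt1); /* never MT_LIST_DELETE here */
		free(j);
	}
}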
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LISTENER_T_H +#define _HAPROXY_LISTENER_T_H + +#include <sys/types.h> +#include <sys/socket.h> + +#include <import/ebtree-t.h> + +#include <haproxy/api-t.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/quic_cc-t.h> +#include <haproxy/quic_sock-t.h> +#include <haproxy/quic_tp-t.h> +#include <haproxy/receiver-t.h> +#include <haproxy/stats-t.h> +#include <haproxy/thread.h> + +/* Some pointer types referenced below */ +struct task; +struct protocol; +struct xprt_ops; +struct proxy; +struct fe_counters; +struct connection; + +/* listener state */ +enum li_state { + LI_NEW = 0, /* not initialized yet */ + LI_INIT, /* all parameters filled in, but not assigned yet */ + LI_ASSIGNED, /* assigned to the protocol, but not listening yet */ + LI_PAUSED, /* listener was paused, it's bound but not listening */ + LI_LISTEN, /* started, listening but not enabled */ + LI_READY, /* started, listening and enabled */ + LI_FULL, /* reached its connection limit */ + LI_LIMITED, /* transient state: limits have been reached, listener is queued */ +} __attribute__((packed)); + +/* Listener transitions + * calloc() set() add_listener() bind() + * -------> NEW ----> INIT ----------> ASSIGNED -----> LISTEN + * <------- <---- <---------- <----- + * free() bzero() del_listener() unbind() + * + * The file descriptor is valid only during these three states: + * + * disable() + * LISTEN <------------ READY + * A| ------------> |A + * || !max & enable() || + * || || + * || max || + * || max & enable() V| !max + * |+---------------> FULL + * +----------------- + * disable() + * + * The LIMITED state may be used when a limit has been detected just before + * using a listener. In this case, the listener MUST be queued into the + * appropriate wait queue (either the proxy's or the global one). It may be + * set back to the READY state at any instant and for any reason, so one must + * not rely on this state. + */ + +/* listener status for stats */ +enum li_status { + LI_STATUS_WAITING = 0, + LI_STATUS_OPEN, + LI_STATUS_FULL, + + LI_STATE_COUNT /* must be last */ +}; + +/* Note: if a bind_conf uses BC_O_UNLIMITED, it is highly recommended that it adds its own + * maxconn setting to the global.maxsock value so that its resources are reserved.
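One plausible reading of how the stats status derives from the runtime state (the authoritative helper is get_li_status(), declared in listener.h; this sketch is an assumption, not its actual body):

static enum li_status sketch_status(enum li_state st)
{
	if (st == LI_READY)
		return LI_STATUS_OPEN;
	if (st == LI_FULL)
		return LI_STATUS_FULL;
	return LI_STATUS_WAITING; /* paused, limited or not yet started */
}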
+ */ + +/* flags used with bind_conf->options */ +#define BC_O_USE_SSL 0x00000001 /* SSL is being used on this bind_conf */ +#define BC_O_GENERATE_CERTS 0x00000002 /* 1 if generate-certificates option is set, else 0 */ +#define BC_O_QUIC_FORCE_RETRY 0x00000004 /* always send Retry on reception of Initial without token */ +#define BC_O_USE_SOCK_DGRAM 0x00000008 /* at least one datagram-type listener is used */ +#define BC_O_USE_SOCK_STREAM 0x00000010 /* at least one stream-type listener is used */ +#define BC_O_USE_XPRT_DGRAM 0x00000020 /* at least one dgram-only xprt listener is used */ +#define BC_O_USE_XPRT_STREAM 0x00000040 /* at least one stream-only xprt listener is used */ +#define BC_O_NOLINGER 0x00000080 /* disable lingering on these listeners */ +#define BC_O_NOQUICKACK 0x00000100 /* disable quick ack of immediate data (linux) */ +#define BC_O_DEF_ACCEPT 0x00000200 /* wait up to 1 second for data before accepting */ +#define BC_O_TCP_FO 0x00000400 /* enable TCP Fast Open (linux >= 3.7) */ +#define BC_O_ACC_PROXY 0x00000800 /* find the proxied address in the first request line */ +#define BC_O_ACC_CIP 0x00001000 /* find the proxied address in the NetScaler Client IP header */ +#define BC_O_UNLIMITED 0x00002000 /* listeners not subject to global limits (peers & stats socket) */ +#define BC_O_NOSTOP 0x00004000 /* keep the listeners active even after a soft stop */ +#define BC_O_REVERSE_HTTP 0x00008000 /* a reverse HTTP bind is used */ +#define BC_O_XPRT_MAXCONN 0x00010000 /* transport layer allocates its own resource prior to accept and is responsible to check maxconn limit */ + + +/* flags used with bind_conf->ssl_options */ +#ifdef USE_OPENSSL +#define BC_SSL_O_NONE 0x0000 +#define BC_SSL_O_NO_TLS_TICKETS 0x0100 /* disable session resumption tickets */ +#define BC_SSL_O_PREF_CLIE_CIPH 0x0200 /* prefer client ciphers */ +#endif + +struct tls_version_filter { + uint16_t flags; /* ssl options */ + uint8_t min; /* min TLS version */ + uint8_t max; /* max TLS version */ +}; + +/* ssl "bind" settings */ +struct ssl_bind_conf { +#ifdef USE_OPENSSL + char *npn_str; /* NPN protocol string */ + int npn_len; /* NPN protocol string length */ + char *alpn_str; /* ALPN protocol string */ + int alpn_len; /* ALPN protocol string length */ + unsigned int verify:3; /* verify method (set of SSL_VERIFY_* flags) */ + unsigned int no_ca_names:1;/* do not send ca names to clients (ca_file related) */ + unsigned int early_data:1; /* early data allowed */ + unsigned int ocsp_update:2;/* enable OCSP auto update */ + char *ca_file; /* CAfile to use on verify and ca-names */ + char *ca_verify_file; /* CAverify file to use on verify only */ + char *crl_file; /* CRLfile to use on verify */ + char *ciphers; /* cipher suite to use if non-null */ + char *ciphersuites; /* TLS 1.3 cipher suite to use if non-null */ + char *curves; /* curves suite to use for ECDHE */ + char *ecdhe; /* named curve to use for ECDHE */ + char *sigalgs; /* Signature algorithms */ + char *client_sigalgs; /* Client Signature algorithms */ + struct tls_version_filter ssl_methods_cfg; /* original ssl methods found in configuration */ + struct tls_version_filter ssl_methods; /* actual ssl methods used at runtime */ +#endif +}; + +/* + * In OpenSSL 3.0.0, the biggest verify error code's value is 94 and on the + * latest 1.1.1 it already reaches 79 so we need to size the ca/crt-ignore-err + * arrays accordingly. If the max error code increases, the arrays might need to + * be resized. 
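Given the sizing rationale above (one bit per X.509 verify error code, packed into 64-bit words, hence the ">> 6" in the IGNERR_BF_SIZE definition that follows), the companion bit operations would look like this sketch (illustrative only):

static inline void sketch_set_ignerr(unsigned long long *bf, int err)
{
	bf[err >> 6] |= 1ULL << (err & 63); /* word index, then bit index */
}

static inline int sketch_is_ignerr(const unsigned long long *bf, int err)
{
	return !!(bf[err >> 6] & (1ULL << (err & 63)));
}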
+ */ +#define SSL_MAX_VFY_ERROR_CODE 94 +#define IGNERR_BF_SIZE ((SSL_MAX_VFY_ERROR_CODE >> 6) + 1) + +/* "bind" line settings */ +struct bind_conf { +#ifdef USE_OPENSSL + struct ssl_bind_conf ssl_conf; /* ssl conf for ctx setting */ + unsigned long long ca_ignerr_bitfield[IGNERR_BF_SIZE]; /* ignored verify errors in handshake if depth > 0 */ + unsigned long long crt_ignerr_bitfield[IGNERR_BF_SIZE]; /* ignored verify errors in handshake if depth == 0 */ + void *initial_ctx; /* SSL context for initial negotiation */ + void *default_ctx; /* SSL context of first/default certificate */ + struct ckch_inst *default_inst; + struct ssl_bind_conf *default_ssl_conf; /* custom SSL conf of default_ctx */ + int strict_sni; /* refuse negotiation if sni doesn't match a certificate */ + int ssl_options; /* ssl options */ + struct eb_root sni_ctx; /* sni_ctx tree of all known certs full-names sorted by name */ + struct eb_root sni_w_ctx; /* sni_ctx tree of all known certs wildcards sorted by name */ + struct tls_keys_ref *keys_ref; /* TLS ticket keys reference */ + + char *ca_sign_file; /* CAFile used to generate and sign server certificates */ + char *ca_sign_pass; /* CAKey passphrase */ + + struct ckch_data *ca_sign_ckch; /* CA and possible certificate chain for ca generation */ +#endif +#ifdef USE_QUIC + struct quic_transport_params quic_params; /* QUIC transport parameters. */ + struct quic_cc_algo *quic_cc_algo; /* QUIC congestion control algorithm */ + size_t max_cwnd; /* QUIC maximum congestion control window size (kB) */ + enum quic_sock_mode quic_mode; /* QUIC socket allocation strategy */ +#endif + struct proxy *frontend; /* the frontend all these listeners belong to, or NULL */ + const struct mux_proto_list *mux_proto; /* the mux to use for all incoming connections (specified by the "proto" keyword) */ + struct xprt_ops *xprt; /* transport-layer operations for all listeners */ + uint options; /* set of BC_O_* flags */ + unsigned int analysers; /* bitmap of required protocol analysers */ + int maxseg; /* for TCP, advertised MSS */ + int tcp_ut; /* for TCP, user timeout */ + int maxaccept; /* if set, max number of connections accepted at once (-1 when disabled) */ + unsigned int backlog; /* if set, listen backlog */ + int maxconn; /* maximum connections allowed on this listener */ + int (*accept)(struct connection *conn); /* upper layer's accept() */ + int level; /* stats access level (ACCESS_LVL_*) */ + int severity_output; /* default severity output format in cli feedback messages */ + short int nice; /* nice value to assign to the instantiated tasks */ + /* 2-byte hole here */ + struct list listeners; /* list of listeners using this bind config */ + uint32_t ns_cip_magic; /* Expected NetScaler Client IP magic number */ + struct list by_fe; /* next binding for the same frontend, or NULL */ + char *arg; /* argument passed to "bind" for better error reporting */ + char *file; /* file where the section appears */ + int line; /* line where the section appears */ + char *rhttp_srvname; /* name of server when using "rhttp@" address */ + int rhttp_nbconn; /* count of connections to initiate in parallel */ + __decl_thread(HA_RWLOCK_T sni_lock); /* lock the SNI trees during add/del operations */ + struct thread_set thread_set; /* entire set of the allowed threads (0=no restriction) */ + struct rx_settings settings; /* all the settings needed for the listening socket */ +}; + +/* Fields of a listener allocated per thread */ +struct li_per_thread { + struct { + struct mt_list list; /* list element in the
QUIC accept queue */ + struct mt_list conns; /* list of QUIC connections from this listener ready to be accepted */ + } quic_accept; + + struct listener *li; /* back reference on the listener */ +}; + + +/* The listener will be directly referenced by the fdtab[] which holds its + * socket. The listener provides the protocol-specific accept() function to + * the fdtab. + */ +struct listener { + enum obj_type obj_type; /* object type = OBJ_TYPE_LISTENER */ + enum li_state state; /* state: NEW, INIT, ASSIGNED, LISTEN, READY, FULL */ + uint16_t flags; /* listener flags: LI_F_* */ + int luid; /* listener universally unique ID, used for SNMP */ + int nbconn; /* current number of connections on this listener */ + unsigned long thr_idx; /* thread indexes for queue distribution (see listener_accept()) */ + __decl_thread(HA_RWLOCK_T lock); + + struct fe_counters *counters; /* statistics counters */ + struct mt_list wait_queue; /* link element to make the listener wait for something (LI_LIMITED) */ + char *name; /* listener's name */ + + unsigned int thr_conn[MAX_THREADS_PER_GROUP]; /* number of connections per thread for the group */ + + struct list by_fe; /* chaining in frontend's list of listeners */ + struct list by_bind; /* chaining in bind_conf's list of listeners */ + struct bind_conf *bind_conf; /* "bind" line settings, which include SSL settings among other things */ + struct receiver rx; /* network receiver parts */ + struct { + struct eb32_node id; /* place in the tree of used IDs */ + } conf; /* config information */ + + struct li_per_thread *per_thr; /* per-thread fields (one per thread in the group) */ + + EXTRA_COUNTERS(extra_counters); +}; + +/* listener flags (16 bits) */ +#define LI_F_FINALIZED 0x0001 /* listener made it to the READY||LIMITED||FULL state at least once, may be suspended/resumed safely */ +#define LI_F_SUSPENDED 0x0002 /* listener has been suspended using suspend_listener(), it is either in LI_PAUSED or LI_ASSIGNED state */ + +/* Descriptor for a "bind" keyword. The ->parse() function returns 0 in case of + * success, or a combination of ERR_* flags if an error is encountered. The + * function pointer can be NULL if not implemented. The function also has + * access to the current "bind" config line. The ->skip value tells the parser + * how many words have to be skipped after the keyword. + */ +struct bind_kw { + const char *kw; + int (*parse)(char **args, int cur_arg, struct proxy *px, struct bind_conf *conf, char **err); + int skip; /* nb of args to skip */ + int rhttp_ok; /* non-zero if kw is supported for reverse HTTP binds */ +}; + +/* same as bind_kw but for crtlist keywords */ +struct ssl_crtlist_kw { + const char *kw; + int (*parse)(char **args, int cur_arg, struct proxy *px, struct ssl_bind_conf *conf, int from_cli, char **err); + int skip; /* nb of args to skip */ +}; + +/* + * A keyword list. It is a NULL-terminated array of keywords. It embeds a + * struct list in order to be linked to other lists, allowing it to easily + * be declared where it is needed, and linked without duplicating data nor + * allocating memory. It is also possible to indicate a scope for the keywords.
+ */ +struct bind_kw_list { + const char *scope; + struct list list; + struct bind_kw kw[VAR_ARRAY]; +}; + +/* The per-thread accept queue ring, must be a power of two minus 1 */ +#define ACCEPT_QUEUE_SIZE ((1<<10) - 1) + +/* head and tail are both 16 bits so that idx can be accessed atomically */ +struct accept_queue_ring { + uint32_t idx; /* (head << 16) | tail */ + struct tasklet *tasklet; /* tasklet of the thread owning this ring */ + struct connection *entry[ACCEPT_QUEUE_SIZE] __attribute((aligned(64))); +}; + + +#endif /* _HAPROXY_LISTENER_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/listener.h b/include/haproxy/listener.h new file mode 100644 index 0000000..5b3dc18 --- /dev/null +++ b/include/haproxy/listener.h @@ -0,0 +1,246 @@ +/* + * include/haproxy/listener.h + * This file declares listener management primitives. + * + * Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LISTENER_H +#define _HAPROXY_LISTENER_H + +#include <stdlib.h> +#include <string.h> + +#include <haproxy/api.h> +#include <haproxy/listener-t.h> + +struct proxy; +struct task; + +int li_init_per_thr(struct listener *li); + +/* adjust the listener's state and its proxy's listener counters if needed */ +void listener_set_state(struct listener *l, enum li_state st); + +/* This function tries to temporarily disable a listener, depending on the OS + * capabilities. Linux unbinds the listen socket after a SHUT_RD, and ignores + * SHUT_WR. Solaris refuses either shutdown(). OpenBSD ignores SHUT_RD but + * closes upon SHUT_WR and refuses to rebind. So a common validation path + * involves SHUT_WR && listen && SHUT_RD. In case of success, the FD's polling + * is disabled. It normally returns non-zero, unless an error is reported. + * It will need to operate under the proxy's lock and the listener's lock. + * suspend() may totally stop a listener if it doesn't support the PAUSED + * state, in which case state will be set to ASSIGNED. + * The caller is responsible for indicating in lpx, lli whether the respective + * locks are already held (non-zero) or not (zero) so that the function pick + * the missing ones, in this order. + */ +int suspend_listener(struct listener *l, int lpx, int lli); + +/* This function tries to resume a temporarily disabled listener. + * The resulting state will either be LI_READY or LI_FULL. 0 is returned + * in case of failure to resume (eg: dead socket). + * It will need to operate under the proxy's lock and the listener's lock. + * The caller is responsible for indicating in lpx, lli whether the respective + * locks are already held (non-zero) or not (zero) so that the function pick + * the missing ones, in this order. 
+ */ +int resume_listener(struct listener *l, int lpx, int lli); + +/* Same as resume_listener(), but will only work to resume from + * LI_FULL or LI_LIMITED states because we try to relax listeners that + * were temporarily restricted and not to resume inactive listeners that + * may have been paused or completely stopped in the meantime. + * Returns positive value for success and 0 for failure. + * It will need to operate under the proxy's lock and the listener's lock. + * The caller is responsible for indicating in lpx, lli whether the respective + * locks are already held (non-zero) or not (zero) so that the function pick + * the missing ones, in this order. + */ +int relax_listener(struct listener *l, int lpx, int lli); + +/* + * This function completely stops a listener. It will need to operate under the + * proxy's lock, the protocol's and the listener's lock. The caller is + * responsible for indicating in lpx, lpr, lli whether the respective locks are + * already held (non-zero) or not (zero) so that the function picks the missing + * ones, in this order. + */ +void stop_listener(struct listener *l, int lpx, int lpr, int lli); + +/* This function adds the specified listener's file descriptor to the polling + * lists if it is in the LI_LISTEN state. The listener enters LI_READY or + * LI_FULL state depending on its number of connections. In daemon mode, we + * also support binding only the relevant processes to their respective + * listeners. We don't do that in debug mode however. + */ +void enable_listener(struct listener *listener); + +/* Dequeues all listeners waiting for a resource the global wait queue */ +void dequeue_all_listeners(void); + +/* Dequeues all listeners waiting for a resource in proxy <px>'s queue */ +void dequeue_proxy_listeners(struct proxy *px); + +/* This function closes the listening socket for the specified listener, + * provided that it's already in a listening state. The listener enters the + * LI_ASSIGNED state, except if the FD is not closed, in which case it may + * remain in LI_LISTEN. Depending on the process's status (master or worker), + * the listener's bind options and the receiver's origin, it may or may not + * close the receiver's FD. Must be called with the lock held. + */ +void do_unbind_listener(struct listener *listener); + +/* This function closes the listening socket for the specified listener, + * provided that it's already in a listening state. The listener enters the + * LI_ASSIGNED state, except if the FD is not closed, in which case it may + * remain in LI_LISTEN. This function is intended to be used as a generic + * function for standard protocols. + */ +void unbind_listener(struct listener *listener); + +/* creates one or multiple listeners for bind_conf <bc> on sockaddr <ss> on port + * range <portl> to <porth>, and possibly attached to fd <fd> (or -1 for auto + * allocation). The address family is taken from ss->ss_family, and the protocol + * passed in <proto> must be usable on this family. The number of jobs and + * listeners is automatically increased by the number of listeners created. It + * returns non-zero on success, zero on error with the error message set in <err>. 
+ */ +int create_listeners(struct bind_conf *bc, const struct sockaddr_storage *ss, + int portl, int porth, int fd, struct protocol *proto, char **err); +struct shard_info *shard_info_attach(struct receiver *rx, struct shard_info *si); +void shard_info_detach(struct receiver *rx); +struct listener *clone_listener(struct listener *src); + +/* Delete a listener from its protocol's list of listeners. The listener's + * state is automatically updated from LI_ASSIGNED to LI_INIT. The protocol's + * number of listeners is updated. Note that the listener must have previously + * been unbound. This is the generic function to use to remove a listener. + */ +void delete_listener(struct listener *listener); +void __delete_listener(struct listener *listener); + +/* This function is called on a read event from a listening socket, corresponding + * to an accept. It tries to accept as many connections as possible, and for each + * calls the listener's accept handler (generally the frontend's accept handler). + */ +void listener_accept(struct listener *l); + +/* Returns a suitable value for a listener's backlog. It uses the listener's, + * otherwise the frontend's backlog, otherwise the listener's maxconn, + * otherwise the frontend's maxconn, otherwise 1024. + */ +int listener_backlog(const struct listener *l); + +/* Notify the listener that a connection initiated from it was released. This + * is used to keep the connection count consistent and to possibly re-open + * listening when it was limited. + */ +void listener_release(struct listener *l); + +/* This function adds the specified <listener> to the protocol <proto>. It + * does nothing if the protocol was already added. The listener's state is + * automatically updated from LI_INIT to LI_ASSIGNED. The number of listeners + * for the protocol is updated. This must be called with the proto lock held. + */ +void default_add_listener(struct protocol *proto, struct listener *listener); + +/* default function used to unbind a listener. This is for use by standard + * protocols working on top of accepted sockets. The receiver's rx_unbind() + * will automatically be used after the listener is disabled if the socket is + * still bound. This must be used under the listener's lock. + */ +void default_unbind_listener(struct listener *listener); + +/* default function called to suspend a listener: it simply passes the call to + * the underlying receiver. This is fine for most socket-based protocols. This + * must be called under the listener's lock. It will return non-zero on success, + * 0 on failure. If no receiver-level suspend is provided, the operation is + * assumed to succeed. + */ +int default_suspend_listener(struct listener *l); + +/* Tries to resume a suspended listener, and returns non-zero on success or + * zero on failure. On certain errors, an alert or a warning might be displayed. + * It must be called with the listener's lock held. Depending on the listener's + * state and protocol, a listen() call might be used to resume operations, or a + * call to the receiver's resume() function might be used as well. This is + * suitable as a default function for TCP and UDP. + */ +int default_resume_listener(struct listener *l); + +/* Applies the thread mask, shards etc. to the bind_conf. It normally returns 0, + * otherwise the number of errors. Upon error it may set error codes (ERR_*) in + * err_code.
It is supposed to be called only once very late in the boot process + * after the bind_conf's thread_set is fixed. The function may emit warnings and + * alerts. Extra listeners may be created on the fly. + */ +int bind_complete_thread_setup(struct bind_conf *bind_conf, int *err_code); + +/* + * Registers the bind keyword list <kwl> as a list of valid keywords for next + * parsing sessions. + */ +void bind_register_keywords(struct bind_kw_list *kwl); + +/* Return a pointer to the bind keyword <kw>, or NULL if not found. */ +struct bind_kw *bind_find_kw(const char *kw); + +/* Dumps all registered "bind" keywords to the <out> string pointer. */ +void bind_dump_kws(char **out); +const char *bind_find_best_kw(const char *word); +int bind_parse_args_list(struct bind_conf *bind_conf, char **args, int cur_arg, + const char *section, const char *file, int linenum); + +void bind_recount_thread_bits(struct bind_conf *conf); +unsigned int bind_map_thread_id(const struct bind_conf *conf, unsigned int r); +struct bind_conf *bind_conf_alloc(struct proxy *fe, const char *file, + int line, const char *arg, struct xprt_ops *xprt); +const char *listener_state_str(const struct listener *l); +struct task *accept_queue_process(struct task *t, void *context, unsigned int state); +struct task *manage_global_listener_queue(struct task *t, void *context, unsigned int state); + +extern struct accept_queue_ring accept_queue_rings[MAX_THREADS] __attribute__((aligned(64))); + +extern const char* li_status_st[LI_STATE_COUNT]; +enum li_status get_li_status(struct listener *l); + +/* number of times an accepted connection resulted in maxconn being reached */ +extern ullong maxconn_reached; + +static inline uint accept_queue_ring_len(const struct accept_queue_ring *ring) +{ + uint idx, head, tail, len; + + idx = _HA_ATOMIC_LOAD(&ring->idx); /* (head << 16) + tail */ + head = idx >> 16; + tail = idx & 0xffff; + len = tail + ACCEPT_QUEUE_SIZE - head; + if (len >= ACCEPT_QUEUE_SIZE) + len -= ACCEPT_QUEUE_SIZE; + return len; +} + +#endif /* _HAPROXY_LISTENER_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/log-t.h b/include/haproxy/log-t.h new file mode 100644 index 0000000..a0a25ac --- /dev/null +++ b/include/haproxy/log-t.h @@ -0,0 +1,277 @@ +/* + * include/haproxy/log-t.h + * This file contains definitions of log-related structures and macros. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
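Two quick illustrations before moving on to the log headers. First, the fallback chain documented for listener_backlog() above, written out as a sketch (the field locations are assumptions of this sketch; only the precedence order comes from the comment):

static int sketch_backlog(const struct listener *l)
{
	if (l->bind_conf->backlog)
		return l->bind_conf->backlog;
	if (l->bind_conf->frontend && l->bind_conf->frontend->backlog)
		return l->bind_conf->frontend->backlog;
	if (l->bind_conf->maxconn)
		return l->bind_conf->maxconn;
	if (l->bind_conf->frontend && l->bind_conf->frontend->maxconn)
		return l->bind_conf->frontend->maxconn;
	return 1024;
}

Second, a worked example for accept_queue_ring_len() above: with ACCEPT_QUEUE_SIZE = 1023, an idx of (5 << 16) | 2 decodes to head = 5 and tail = 2, so len = 2 + 1023 - 5 = 1020, meaning the tail has wrapped and 1020 entries are pending; when head == tail the same arithmetic correctly yields 0.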
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_LOG_T_H +#define _HAPROXY_LOG_T_H + +#include <sys/socket.h> +#include <sys/un.h> +#include <netinet/in.h> + +#include <haproxy/api-t.h> +#include <haproxy/ring-t.h> +#include <haproxy/thread-t.h> + + +#define NB_LOG_FACILITIES 24 +#define NB_LOG_LEVELS 8 +#define NB_LOG_HDR_MAX_ELEMENTS 15 +#define SYSLOG_PORT 514 +#define UNIQUEID_LEN 128 + +/* flags used in logformat_node->options */ +#define LOG_OPT_HEXA 0x00000001 +#define LOG_OPT_MANDATORY 0x00000002 +#define LOG_OPT_QUOTE 0x00000004 +#define LOG_OPT_REQ_CAP 0x00000008 +#define LOG_OPT_RES_CAP 0x00000010 +#define LOG_OPT_HTTP 0x00000020 +#define LOG_OPT_ESC 0x00000040 +#define LOG_OPT_MERGE_SPACES 0x00000080 + + +/* Fields that need to be extracted from the incoming connection or request for + * logging or for sending specific header information. They're set in px->to_log + * and appear as flags in session->logs.logwait, which are removed once the + * required information has been collected. + */ +#define LW_INIT 1 /* anything */ +#define LW_CLIP 2 /* CLient IP */ +#define LW_SVIP 4 /* SerVer IP */ +#define LW_SVID 8 /* server ID */ +#define LW_REQ 16 /* http REQuest */ +#define LW_RESP 32 /* http RESPonse */ +#define LW_BYTES 256 /* bytes read from server */ +#define LW_COOKIE 512 /* captured cookie */ +#define LW_REQHDR 1024 /* request header(s) */ +#define LW_RSPHDR 2048 /* response header(s) */ +#define LW_BCKIP 4096 /* backend IP */ +#define LW_FRTIP 8192 /* frontend IP */ +#define LW_XPRT 16384 /* transport layer information (eg: SSL) */ + +#define LOG_LEGACYTIME_LEN 15 +#define LOG_ISOTIME_MINLEN 20 +#define LOG_ISOTIME_MAXLEN 32 + +/* enum for log format */ +enum log_fmt { + LOG_FORMAT_UNSPEC = 0, + LOG_FORMAT_LOCAL, + LOG_FORMAT_RFC3164, + LOG_FORMAT_RFC5424, + LOG_FORMAT_PRIO, + LOG_FORMAT_SHORT, + LOG_FORMAT_TIMED, + LOG_FORMAT_ISO, + LOG_FORMAT_RAW, + LOG_FORMATS /* number of supported log formats, must always be last */ +}; + +/* enum log header meta data */ +enum log_meta { + LOG_META_PRIO, + LOG_META_TIME, + LOG_META_HOST, + LOG_META_TAG, + LOG_META_PID, + LOG_META_MSGID, + LOG_META_STDATA, + LOG_META_FIELDS /* must always be the last */ +}; + +/* log header data */ +struct log_header { + enum log_fmt format; /* how to format the header */ + int level, facility; /* used by several formats */ + struct ist *metadata; /* optional metadata - per-format */ +}; + +#define LOG_HEADER_NONE (struct log_header){ \ + .format = LOG_FORMAT_UNSPEC, \ + .level = 0, \ + .facility = 0, \ + .metadata = NULL \ + } + +/* log target types */ +enum log_tgt { + LOG_TARGET_DGRAM = 0, // datagram address (udp, unix socket) + LOG_TARGET_FD, // file descriptor + LOG_TARGET_BUFFER, // ring buffer + LOG_TARGET_BACKEND, // backend with SYSLOG mode +}; + +/* lists of fields that can be logged, for logformat_node->type */ +enum { + + LOG_FMT_TEXT = 0, /* raw text */ + LOG_FMT_EXPR, /* sample expression */ + LOG_FMT_SEPARATOR, /* separator replaced by one space */ + + /* information fields */ + LOG_FMT_GLOBAL, + LOG_FMT_CLIENTIP, + LOG_FMT_CLIENTPORT, + LOG_FMT_BACKENDIP, + LOG_FMT_BACKENDPORT, + LOG_FMT_FRONTENDIP, + LOG_FMT_FRONTENDPORT, + LOG_FMT_SERVERPORT, + LOG_FMT_SERVERIP, + LOG_FMT_COUNTER, + LOG_FMT_LOGCNT, + LOG_FMT_PID, + LOG_FMT_DATE, + LOG_FMT_DATEGMT, + LOG_FMT_DATELOCAL, + LOG_FMT_TS, + 
LOG_FMT_MS,
+	LOG_FMT_FRONTEND,
+	LOG_FMT_FRONTEND_XPRT,
+	LOG_FMT_BACKEND,
+	LOG_FMT_SERVER,
+	LOG_FMT_BYTES,
+	LOG_FMT_BYTES_UP,
+	LOG_FMT_Ta,
+	LOG_FMT_Th,
+	LOG_FMT_Ti,
+	LOG_FMT_TQ,
+	LOG_FMT_TW,
+	LOG_FMT_TC,
+	LOG_FMT_Tr,
+	LOG_FMT_tr,
+	LOG_FMT_trg,
+	LOG_FMT_trl,
+	LOG_FMT_TR,
+	LOG_FMT_TD,
+	LOG_FMT_TT,
+	LOG_FMT_TU,
+	LOG_FMT_STATUS,
+	LOG_FMT_CCLIENT,
+	LOG_FMT_CSERVER,
+	LOG_FMT_TERMSTATE,
+	LOG_FMT_TERMSTATE_CK,
+	LOG_FMT_ACTCONN,
+	LOG_FMT_FECONN,
+	LOG_FMT_BECONN,
+	LOG_FMT_SRVCONN,
+	LOG_FMT_RETRIES,
+	LOG_FMT_SRVQUEUE,
+	LOG_FMT_BCKQUEUE,
+	LOG_FMT_HDRREQUEST,
+	LOG_FMT_HDRRESPONS,
+	LOG_FMT_HDRREQUESTLIST,
+	LOG_FMT_HDRRESPONSLIST,
+	LOG_FMT_REQ,
+	LOG_FMT_HTTP_METHOD,
+	LOG_FMT_HTTP_URI,
+	LOG_FMT_HTTP_PATH,
+	LOG_FMT_HTTP_PATH_ONLY,
+	LOG_FMT_HTTP_QUERY,
+	LOG_FMT_HTTP_VERSION,
+	LOG_FMT_HOSTNAME,
+	LOG_FMT_UNIQUEID,
+	LOG_FMT_SSL_CIPHER,
+	LOG_FMT_SSL_VERSION,
+};
+
+/* enum for parse_logformat_string */
+enum {
+	LF_INIT = 0,   // before first character
+	LF_TEXT,       // normal text
+	LF_SEPARATOR,  // a single separator
+	LF_VAR,        // variable name, after '%' or '%{..}'
+	LF_STARTVAR,   // % in text
+	LF_STARG,      // after '%{' and before '}'
+	LF_EDARG,      // '}' after '%{'
+	LF_STEXPR,     // after '%[' or '%{..}[' and before ']'
+	LF_EDEXPR,     // ']' after '%['
+	LF_END,        // \0 found
+};
+
+
+struct logformat_node {
+	struct list list;
+	int type;      // LOG_FMT_*
+	int options;   // LOG_OPT_*
+	char *arg;     // text for LOG_FMT_TEXT, arg for others
+	void *expr;    // for use with LOG_FMT_EXPR
+};
+
+/* Range of indexes for log sampling. */
+struct smp_log_range {
+	unsigned int low;  /* Low limit of the indexes of this range. */
+	unsigned int high; /* High limit of the indexes of this range. */
+	size_t sz;         /* The size of this range, or number of indexes in
+	                    * this range.
+	                    */
+};
+
+/* Log sampling information. */
+struct smp_info {
+	struct smp_log_range *smp_rgs; /* Array of ranges for log sampling. */
+	size_t smp_rgs_sz;             /* The size of <smp_rgs> array. */
+	size_t smp_sz;                 /* The total number of logs to be sampled. */
+	ullong curr_rg_idx;            /* 63:32 = current range; 31:0 = current index */
+};
+
+enum log_target_flags {
+	LOG_TARGET_FL_NONE     = 0x00,
+	LOG_TARGET_FL_RESOLVED = 0x01
+};
+
+struct log_target {
+	struct sockaddr_storage *addr;
+	union {
+		char *ring_name;    /* type = BUFFER  - preparsing  */
+		struct sink *sink;  /* type = BUFFER  - postparsing */
+		char *be_name;      /* type = BACKEND - preparsing  */
+		struct proxy *be;   /* type = BACKEND - postparsing */
+		char *resolv_name;  /* generic        - preparsing  */
+	};
+	enum log_tgt type;
+	uint16_t flags;
+};
+
+struct logger {
+	struct list list;
+	struct log_target target;
+	struct smp_info lb;
+	enum log_fmt format;
+	int facility;
+	int level;
+	int minlvl;
+	int maxlen;
+	struct logger *ref;
+	struct {
+		char *file; /* file where the logger appears */
+		int line;   /* line where the logger appears */
+	} conf;
+};
+
+#endif /* _HAPROXY_LOG_T_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/log.h b/include/haproxy/log.h
new file mode 100644
index 0000000..68b8207
--- /dev/null
+++ b/include/haproxy/log.h
@@ -0,0 +1,195 @@
+/*
+ * include/haproxy/log.h
+ * This file contains definitions of log-related functions.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_LOG_H
+#define _HAPROXY_LOG_H
+
+#include <syslog.h>
+
+#include <haproxy/api.h>
+#include <haproxy/log-t.h>
+#include <haproxy/pool-t.h>
+#include <haproxy/proxy-t.h>
+#include <haproxy/stream.h>
+
+extern struct pool_head *pool_head_requri;
+extern struct pool_head *pool_head_uniqueid;
+
+extern const char *log_levels[];
+extern char *log_format;
+extern char httpclient_log_format[];
+extern char default_tcp_log_format[];
+extern char default_http_log_format[];
+extern char clf_http_log_format[];
+extern char default_https_log_format[];
+
+extern char default_rfc5424_sd_log_format[];
+
+extern const char sess_term_cond[];
+extern const char sess_fin_state[];
+
+extern unsigned int dropped_logs;
+
+/* log forward proxy list */
+extern struct proxy *cfg_log_forward;
+
+extern THREAD_LOCAL char *logline;
+extern THREAD_LOCAL char *logline_rfc5424;
+
+/* global syslog message counter */
+extern int cum_log_messages;
+
+/* syslog UDP message handler */
+void syslog_fd_handler(int fd);
+
+/* Initialize/Deinitialize log buffers used for syslog messages */
+int init_log_buffers(void);
+void deinit_log_buffers(void);
+
+/* build a log line for the session and an optional stream */
+int sess_build_logline(struct session *sess, struct stream *s, char *dst, size_t maxsize, struct list *list_format);
+
+/*
+ * send a log for the stream when we have enough info about it.
+ * Will not log if the frontend has no log defined.
+ */
+void strm_log(struct stream *s);
+void sess_log(struct session *sess);
+
+/* send an applicative log with a custom list of loggers */
+void app_log(struct list *loggers, struct buffer *tag, int level, const char *format, ...)
+	__attribute__ ((format(printf, 4, 5)));
+
+/*
+ * add to the logformat linked list
+ */
+int add_to_logformat_list(char *start, char *end, int type, struct list *list_format, char **err);
+
+/*
+ * Parse the log_format string and fill a linked list.
+ * Variable names are preceded by % and composed of characters [a-zA-Z0-9]* : %varname
+ * You can set arguments using { } : %{many arguments}varname
+ */
+int parse_logformat_string(const char *str, struct proxy *curproxy, struct list *list_format, int options, int cap, char **err);
+
+int postresolve_logger_list(struct list *loggers, const char *section, const char *section_name);
+
+struct logger *dup_logger(struct logger *def);
+void free_logger(struct logger *logger);
+void deinit_log_target(struct log_target *target);
+
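+/* Example: a string accepted by parse_logformat_string() above; this is
+ * essentially the default HTTP log format, where plain text is copied
+ * verbatim, %name references a variable and %{+Q}r applies the quote
+ * option to the %r variable:
+ *
+ *   "%ci:%cp [%tr] %ft %b/%s %TR/%Tw/%Tc/%Tr/%Ta %ST %B %CC %CS %tsc \
+ *    %ac/%fc/%bc/%sc/%rc %sq/%bq %hr %hs %{+Q}r"
+ *
+ * From C, such a string would be parsed into a list head roughly as follows
+ * (a minimal sketch, assuming <curproxy> is a valid frontend proxy):
+ *
+ *   struct list fmt = LIST_HEAD_INIT(fmt);
+ *   char *err = NULL;
+ *
+ *   if (!parse_logformat_string("%ci:%cp [%tr] %ST", curproxy, &fmt,
+ *                               0, SMP_VAL_FE_LOG_END, &err))
+ *       ...report and free <err>...
+ */
+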
+/* Parse "log" keyword and update the linked list. */
+int parse_logger(char **args, struct list *loggers, int do_del, const char *file, int linenum, char **err);
+
+/*
+ * This function adds a header to the message and sends the syslog message
+ * using a printf format string
+ */
+void send_log(struct proxy *p, int level, const char *format, ...)
+	__attribute__ ((format(printf, 3, 4)));
+
+/*
+ * This function sends a syslog message to all loggers of a proxy,
+ * or to global loggers if the proxy is NULL.
+ * It also tries not to waste too much time computing the message header.
+ * It doesn't care about errors nor does it report them.
+ */
+
+void __send_log(struct list *loggers, struct buffer *tag, int level, char *message, size_t size, char *sd, size_t sd_size);
+
+/*
+ * returns log format for <fmt> or LOG_FORMAT_UNSPEC if not found.
+ */
+enum log_fmt get_log_format(const char *fmt);
+
+/*
+ * returns log level for <lev> or -1 if not found.
+ */
+int get_log_level(const char *lev);
+
+/*
+ * returns log facility for <fac> or -1 if not found.
+ */
+int get_log_facility(const char *fac);
+
+/*
+ * Write a string into the log string.
+ * Takes care of the quote options.
+ *
+ * Returns the address of the \0 character, or NULL on error
+ */
+char *lf_text_len(char *dst, const char *src, size_t len, size_t size, const struct logformat_node *node);
+
+/*
+ * Write an IP address to the log string.
+ * The +X option writes it in hexadecimal notation, most significant byte on the left
+ */
+char *lf_ip(char *dst, const struct sockaddr *sockaddr, size_t size, const struct logformat_node *node);
+
+/*
+ * Write a port to the log.
+ * The +X option writes it in hexadecimal notation, most significant byte on the left
+ */
+char *lf_port(char *dst, const struct sockaddr *sockaddr, size_t size, const struct logformat_node *node);
+
+
+/*
+ * Functions to handle log header building (exported for sinks)
+ */
+char *update_log_hdr_rfc5424(const time_t time, suseconds_t frac);
+char *update_log_hdr(const time_t time);
+char *get_format_pid_sep1(int format, size_t *len);
+char *get_format_pid_sep2(int format, size_t *len);
+
+/*
+ * Builds a log line for the stream (must be valid).
+ */
+static inline int build_logline(struct stream *s, char *dst, size_t maxsize, struct list *list_format)
+{
+	return sess_build_logline(strm_sess(s), s, dst, maxsize, list_format);
+}
+
+struct ist *build_log_header(struct log_header hdr, size_t *nbelem);
+
+/*
+ * lookup log forward proxy by name
+ * Returns NULL if no proxy found.
+ */
+static inline struct proxy *log_forward_by_name(const char *name)
+{
+	struct proxy *px = cfg_log_forward;
+
+	while (px) {
+		if (strcmp(px->id, name) == 0)
+			return px;
+		px = px->next;
+	}
+	return NULL;
+}
+
+#endif /* _HAPROXY_LOG_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/mailers-t.h b/include/haproxy/mailers-t.h
new file mode 100644
index 0000000..0fa3197
--- /dev/null
+++ b/include/haproxy/mailers-t.h
@@ -0,0 +1,83 @@
+/*
+ * include/haproxy/mailers-t.h
+ * This file defines everything related to mailers.
+ *
+ * Copyright 2015 Horms Solutions Ltd., Simon Horman <horms@verge.net.au>
+ *
+ * Based on include/haproxy/peers-t.h
+ *
+ * Copyright 2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_MAILERS_T_H
+#define _HAPROXY_MAILERS_T_H
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <haproxy/check-t.h>
+#include <haproxy/tcpcheck-t.h>
+#include <haproxy/thread-t.h>
+
+struct mailer {
+	char *id;
+	struct mailers *mailers;
+	struct {
+		const char *file;     /* file where the section appears */
+		int line;             /* line where the section appears */
+	} conf;                       /* config information */
+	struct sockaddr_storage addr; /* SMTP server address */
+	struct protocol *proto;       /* SMTP server address's protocol */
+	struct xprt_ops *xprt;        /* SMTP server socket operations at transport layer */
+	void *sock_init_arg;          /* socket operations's opaque init argument if needed */
+	struct mailer *next;          /* next mailer in the list */
+};
+
+struct mailers {
+	char *id;                     /* mailers section name */
+	struct mailer *mailer_list;   /* mailers in this mailers section */
+	struct {
+		const char *file;     /* file where the section appears */
+		int line;             /* line where the section appears */
+	} conf;                       /* config information */
+	struct mailers *next;         /* next mailers section */
+	int count;                    /* total number of mailers in this mailers section */
+	int users;                    /* number of users of this mailers section */
+	struct {                      /* time to: */
+		int mail;             /* try connecting to the mail server and sending an email */
+	} timeout;
+};
+
+struct email_alert {
+	struct list list;
+	struct tcpcheck_rules rules;
+	struct server *srv;
+};
+
+struct email_alertq {
+	struct list email_alerts;
+	struct check check;           /* Email alerts are implemented using the existing check
+	                               * code even though they are not checks. This structure
+	                               * is used as a parameter by the check code.
+	                               * Each check corresponds to a mailer */
+	__decl_thread(HA_SPINLOCK_T lock);
+};
+
+#endif /* _HAPROXY_MAILERS_T_H */
+
diff --git a/include/haproxy/mailers.h b/include/haproxy/mailers.h
new file mode 100644
index 0000000..89aa1b0
--- /dev/null
+++ b/include/haproxy/mailers.h
@@ -0,0 +1,42 @@
+/*
+ * include/haproxy/mailers.h
+ * This file lists exported variables and functions for mailers.
+ *
+ * Copyright 2015 Horms Solutions Ltd., Simon Horman <horms@verge.net.au>
+ * Copyright 2020 Willy Tarreau <w@1wt.eu>
+ *
+ * Based on include/haproxy/peers-t.h
+ *
+ * Copyright 2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_MAILERS_H +#define _HAPROXY_MAILERS_H + +#include <haproxy/mailers-t.h> +#include <haproxy/proxy-t.h> +#include <haproxy/server-t.h> + +extern struct mailers *mailers; +extern int send_email_disabled; + +int init_email_alert(struct mailers *mailers, struct proxy *p, char **err); +void send_email_alert(struct server *s, int priority, const char *format, ...) + __attribute__ ((format(printf, 3, 4))); + + +#endif /* _HAPROXY_MAILERS_H */ diff --git a/include/haproxy/map-t.h b/include/haproxy/map-t.h new file mode 100644 index 0000000..d6085ee --- /dev/null +++ b/include/haproxy/map-t.h @@ -0,0 +1,34 @@ +/* + * include/haproxy/map-t.h + * This file provides structures and types for MAPs. + * + * Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_MAP_T_H +#define _HAPROXY_MAP_T_H + +#include <haproxy/pattern-t.h> +#include <haproxy/sample-t.h> + +struct map_descriptor { + struct sample_conv *conv; /* original converter descriptor */ + struct pattern_head pat; /* the pattern matching associated to the map */ + int do_free; /* set if <pat> is the original pat and must be freed */ +}; + +#endif /* _HAPROXY_MAP_T_H */ diff --git a/include/haproxy/map.h b/include/haproxy/map.h new file mode 100644 index 0000000..3ec3418 --- /dev/null +++ b/include/haproxy/map.h @@ -0,0 +1,39 @@ +/* + * include/haproxy/map.h + * This file provides structures and types for pattern matching. + * + * Copyright (C) 2000-2013 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_MAP_H +#define _HAPROXY_MAP_H + +#include <haproxy/map-t.h> +#include <haproxy/sample-t.h> + +/* maps output sample parser */ +int map_parse_ip(const char *text, struct sample_data *data); +int map_parse_ip6(const char *text, struct sample_data *data); +int map_parse_str(const char *text, struct sample_data *data); +int map_parse_int(const char *text, struct sample_data *data); + +struct map_reference *map_get_reference(const char *reference); + +int sample_load_map(struct arg *arg, struct sample_conv *conv, + const char *file, int line, char **err); + +#endif /* _HAPROXY_MAP_H */ diff --git a/include/haproxy/mqtt-t.h b/include/haproxy/mqtt-t.h new file mode 100644 index 0000000..51f55ea --- /dev/null +++ b/include/haproxy/mqtt-t.h @@ -0,0 +1,310 @@ +/* + * include/haproxy/mqtt.h + * This file contains structure declarations for MQTT protocol. + * + * Copyright 2020 Baptiste Assmann <bedis9@gmail.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_MQTT_T_H +#define _HAPROXY_MQTT_T_H + +#include <import/ist.h> + +/* MQTT protocol version + * In MQTT 3.1.1, version is called "level" + */ +#define MQTT_VERSION_3_1 3 +#define MQTT_VERSION_3_1_1 4 +#define MQTT_VERSION_5_0 5 + +/* + * return code when parsing / validating MQTT messages + */ +#define MQTT_INVALID_MESSAGE -1 +#define MQTT_NEED_MORE_DATA 0 +#define MQTT_VALID_MESSAGE 1 + + +/* + * MQTT Control Packet Type: MQTT_CPT_* + * + * Part of the fixed headers, encoded on the first packet byte : + * + * +-------+-----------+-----------+-----------+---------+----------+----------+---------+------------+ + * | bit | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0 | + * +-------+-----------+-----------+-----------+---------+----------+----------+---------+------------+ + * | field | MQTT Control Packet Type | Flags specific to each Control Packet type | + * +-------+---------------------------------------------+--------------------------------------------+ + * + * Don't forget to "left offset by 4 bits (<< 4)" the values below when matching against the fixed + * header collected in a MQTT packet. 
+ * + * value 0x0 is reserved and forbidden + */ +enum { + MQTT_CPT_INVALID = 0, + + MQTT_CPT_CONNECT, + MQTT_CPT_CONNACK, + MQTT_CPT_PUBLISH, + MQTT_CPT_PUBACK, + MQTT_CPT_PUBREC, + MQTT_CPT_PUBREL, + MQTT_CPT_PUBCOMP, + MQTT_CPT_SUBSCRIBE, + MQTT_CPT_SUBACK, + MQTT_CPT_UNSUBSCRIBE, + MQTT_CPT_UNSUBACK, + MQTT_CPT_PINGREQ, + MQTT_CPT_PINGRESP, + MQTT_CPT_DISCONNECT, + MQTT_CPT_AUTH, + MQTT_CPT_ENTRIES /* used to mark the end/size of our MQTT_CPT_* list */ +}; + +/* MQTT CONNECT packet flags */ +#define MQTT_CONNECT_FL_RESERVED 0x01 +#define MQTT_CONNECT_FL_CLEAN_SESSION 0x02 +#define MQTT_CONNECT_FL_WILL 0x04 +#define MQTT_CONNECT_FL_WILL_QOS 0x18 /* covers 2 bits 00011000 */ +#define MQTT_CONNECT_FL_WILL_RETAIN 0x20 +#define MQTT_CONNECT_FL_PASSWORD 0x40 +#define MQTT_CONNECT_FL_USERNAME 0x80 + +/* MQTT packet properties identifiers + * https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901029 + */ +#define MQTT_PROP_PAYLOAD_FORMAT_INDICATOR 0x01 +#define MQTT_PROP_MESSAGE_EXPIRY_INTERVAL 0x02 +#define MQTT_PROP_CONTENT_TYPE 0x03 +#define MQTT_PROP_RESPONSE_TOPIC 0x08 +#define MQTT_PROP_CORRELATION_DATA 0x09 +#define MQTT_PROP_SESSION_EXPIRY_INTERVAL 0x11 +#define MQTT_PROP_ASSIGNED_CLIENT_IDENTIFIER 0x12 +#define MQTT_PROP_SERVER_KEEPALIVE 0x13 +#define MQTT_PROP_AUTHENTICATION_METHOD 0x15 +#define MQTT_PROP_AUTHENTICATION_DATA 0x16 +#define MQTT_PROP_REQUEST_PROBLEM_INFORMATION 0x17 +#define MQTT_PROP_WILL_DELAY_INTERVAL 0x18 +#define MQTT_PROP_REQUEST_RESPONSE_INFORMATION 0x19 +#define MQTT_PROP_RESPONSE_INFORMATION 0x1A +#define MQTT_PROP_SERVER_REFERENCE 0x1C +#define MQTT_PROP_RECEIVE_MAXIMUM 0x21 +#define MQTT_PROP_TOPIC_ALIAS_MAXIMUM 0x22 +#define MQTT_PROP_MAXIMUM_QOS 0x24 +#define MQTT_PROP_RETAIN_AVAILABLE 0x25 +#define MQTT_PROP_USER_PROPERTIES 0x26 +#define MQTT_PROP_MAXIMUM_PACKET_SIZE 0x27 +#define MQTT_PROP_WILDCARD_SUBSCRIPTION_AVAILABLE 0x28 +#define MQTT_PROP_SUBSCRIPTION_IDENTIFIERS_AVAILABLE 0x29 +#define MQTT_PROP_SHARED_SUBSRIPTION_AVAILABLE 0x2A +#define MQTT_PROP_REASON_STRING 0x1F +#define MQTT_PROP_LAST 0xFF + +/* MQTT minimal packet size */ +#define MQTT_MIN_PKT_SIZE 2 +#define MQTT_REMAINING_LENGHT_MAX_SIZE 4 + +/* list of supported capturable Field Names and configuration file string */ +enum { + MQTT_FN_INVALID = 0, + + MQTT_FN_FLAGS, + MQTT_FN_REASON_CODE, + MQTT_FN_PROTOCOL_NAME, + MQTT_FN_PROTOCOL_VERSION, + MQTT_FN_CLIENT_IDENTIFIER, + MQTT_FN_WILL_TOPIC, + MQTT_FN_WILL_PAYLOAD, + MQTT_FN_USERNAME, + MQTT_FN_PASSWORD, + MQTT_FN_KEEPALIVE, + + /* MQTT 5.0 properties + * https://docs.oasis-open.org/mqtt/mqtt/v5.0/os/mqtt-v5.0-os.html#_Toc3901029 + */ + MQTT_FN_PAYLOAD_FORMAT_INDICATOR, + MQTT_FN_MESSAGE_EXPIRY_INTERVAL, + MQTT_FN_CONTENT_TYPE, + MQTT_FN_RESPONSE_TOPIC, + MQTT_FN_CORRELATION_DATA, + MQTT_FN_SUBSCRIPTION_IDENTIFIER, + MQTT_FN_SESSION_EXPIRY_INTERVAL, + MQTT_FN_ASSIGNED_CLIENT_IDENTIFIER, + MQTT_FN_SERVER_KEEPALIVE, + MQTT_FN_AUTHENTICATION_METHOD, + MQTT_FN_AUTHENTICATION_DATA, + MQTT_FN_REQUEST_PROBLEM_INFORMATION, + MQTT_FN_DELAY_INTERVAL, + MQTT_FN_REQUEST_RESPONSE_INFORMATION, + MQTT_FN_RESPONSE_INFORMATION, + MQTT_FN_SERVER_REFERENCE, + MQTT_FN_REASON_STRING, + MQTT_FN_RECEIVE_MAXIMUM, + MQTT_FN_TOPIC_ALIAS_MAXIMUM, + MQTT_FN_TOPIC_ALIAS, + MQTT_FN_MAXIMUM_QOS, + MQTT_FN_RETAIN_AVAILABLE, + MQTT_FN_USER_PROPERTY, + MQTT_FN_MAXIMUM_PACKET_SIZE, + MQTT_FN_WILDCARD_SUBSCRIPTION_AVAILABLE, + MQTT_FN_SUBSCRIPTION_IDENTIFIERS_AVAILABLE, + MQTT_FN_SHARED_SUBSCRIPTION_AVAILABLE, + + MQTT_FN_ENTRIES /* this one must always be 
the latest one */ +}; + +/* MQTT field string bit, for easy match using bitmasks + * ATTENTION: "user-properties" are not supported for now + */ +enum { + MQTT_FN_BIT_FLAGS = (1ULL << MQTT_FN_FLAGS), + MQTT_FN_BIT_REASON_CODE = (1ULL << MQTT_FN_REASON_CODE), + MQTT_FN_BIT_PROTOCOL_NAME = (1ULL << MQTT_FN_PROTOCOL_NAME), + MQTT_FN_BIT_PROTOCOL_VERSION = (1ULL << MQTT_FN_PROTOCOL_VERSION), + MQTT_FN_BIT_CLIENT_IDENTIFIER = (1ULL << MQTT_FN_CLIENT_IDENTIFIER), + MQTT_FN_BIT_WILL_TOPIC = (1ULL << MQTT_FN_WILL_TOPIC), + MQTT_FN_BIT_WILL_PAYLOAD = (1ULL << MQTT_FN_WILL_PAYLOAD), + MQTT_FN_BIT_USERNAME = (1ULL << MQTT_FN_USERNAME), + MQTT_FN_BIT_PASSWORD = (1ULL << MQTT_FN_PASSWORD), + MQTT_FN_BIT_KEEPALIVE = (1ULL << MQTT_FN_KEEPALIVE), + MQTT_FN_BIT_PAYLOAD_FORMAT_INDICATOR = (1ULL << MQTT_FN_PAYLOAD_FORMAT_INDICATOR), + MQTT_FN_BIT_MESSAGE_EXPIRY_INTERVAL = (1ULL << MQTT_FN_MESSAGE_EXPIRY_INTERVAL), + MQTT_FN_BIT_CONTENT_TYPE = (1ULL << MQTT_FN_CONTENT_TYPE), + MQTT_FN_BIT_RESPONSE_TOPIC = (1ULL << MQTT_FN_RESPONSE_TOPIC), + MQTT_FN_BIT_CORRELATION_DATA = (1ULL << MQTT_FN_CORRELATION_DATA), + MQTT_FN_BIT_SUBSCRIPTION_IDENTIFIER = (1ULL << MQTT_FN_SUBSCRIPTION_IDENTIFIER), + MQTT_FN_BIT_SESSION_EXPIRY_INTERVAL = (1ULL << MQTT_FN_SESSION_EXPIRY_INTERVAL), + MQTT_FN_BIT_ASSIGNED_CLIENT_IDENTIFIER = (1ULL << MQTT_FN_ASSIGNED_CLIENT_IDENTIFIER), + MQTT_FN_BIT_SERVER_KEEPALIVE = (1ULL << MQTT_FN_SERVER_KEEPALIVE), + MQTT_FN_BIT_AUTHENTICATION_METHOD = (1ULL << MQTT_FN_AUTHENTICATION_METHOD), + MQTT_FN_BIT_AUTHENTICATION_DATA = (1ULL << MQTT_FN_AUTHENTICATION_DATA), + MQTT_FN_BIT_REQUEST_PROBLEM_INFORMATION = (1ULL << MQTT_FN_REQUEST_PROBLEM_INFORMATION), + MQTT_FN_BIT_DELAY_INTERVAL = (1ULL << MQTT_FN_DELAY_INTERVAL), + MQTT_FN_BIT_REQUEST_RESPONSE_INFORMATION = (1ULL << MQTT_FN_REQUEST_RESPONSE_INFORMATION), + MQTT_FN_BIT_RESPONSE_INFORMATION = (1ULL << MQTT_FN_RESPONSE_INFORMATION), + MQTT_FN_BIT_SERVER_REFERENCE = (1ULL << MQTT_FN_SERVER_REFERENCE), + MQTT_FN_BIT_REASON_STRING = (1ULL << MQTT_FN_REASON_STRING), + MQTT_FN_BIT_RECEIVE_MAXIMUM = (1ULL << MQTT_FN_RECEIVE_MAXIMUM), + MQTT_FN_BIT_TOPIC_ALIAS_MAXIMUM = (1ULL << MQTT_FN_TOPIC_ALIAS_MAXIMUM), + MQTT_FN_BIT_TOPIC_ALIAS = (1ULL << MQTT_FN_TOPIC_ALIAS), + MQTT_FN_BIT_MAXIMUM_QOS = (1ULL << MQTT_FN_MAXIMUM_QOS), + MQTT_FN_BIT_RETAIN_AVAILABLE = (1ULL << MQTT_FN_RETAIN_AVAILABLE), + MQTT_FN_BIT_USER_PROPERTY = (1ULL << MQTT_FN_USER_PROPERTY), + MQTT_FN_BIT_MAXIMUM_PACKET_SIZE = (1ULL << MQTT_FN_MAXIMUM_PACKET_SIZE), + MQTT_FN_BIT_WILDCARD_SUBSCRIPTION_AVAILABLE = (1ULL << MQTT_FN_WILDCARD_SUBSCRIPTION_AVAILABLE), + MQTT_FN_BIT_SUBSCRIPTION_IDENTIFIERS_AVAILABLE= (1ULL << MQTT_FN_SUBSCRIPTION_IDENTIFIERS_AVAILABLE), + MQTT_FN_BIT_SHARED_SUBSCRIPTION_AVAILABLE = (1ULL << MQTT_FN_SHARED_SUBSCRIPTION_AVAILABLE), +}; + +/* structure to host fields for a MQTT CONNECT packet */ +#define MQTT_PROP_USER_PROPERTY_ENTRIES 5 +struct connect { + struct { + struct ist protocol_name; + uint8_t protocol_version; + uint8_t flags; + uint16_t keepalive; + + struct { + uint32_t session_expiry_interval; + uint16_t receive_maximum; + uint32_t maximum_packet_size; + uint16_t topic_alias_maximum; + uint8_t request_response_information; + uint8_t request_problem_information; + struct { + struct ist name; + struct ist value; + } user_props[MQTT_PROP_USER_PROPERTY_ENTRIES]; + struct ist authentication_method; + struct ist authentication_data; + } props; + } var_hdr; + struct { + struct ist client_identifier; + struct { + uint32_t delay_interval; + uint8_t 
payload_format_indicator;
+			uint32_t message_expiry_interval;
+			struct ist content_type;
+			struct ist response_topic;
+			struct ist correlation_data;
+			struct {
+				struct ist name;
+				struct ist value;
+			} user_props[MQTT_PROP_USER_PROPERTY_ENTRIES];
+		} will_props;
+		struct ist will_topic;
+		struct ist will_payload;
+		struct ist username;
+		struct ist password;
+	} payload;
+};
+
+/* structure to host fields for a MQTT CONNACK packet */
+struct connack {
+	struct {
+		uint8_t protocol_version;
+		uint8_t flags;
+		uint8_t reason_code;
+		struct {
+			uint32_t session_expiry_interval;
+			uint16_t receive_maximum;
+			uint8_t maximum_qos;
+			uint8_t retain_available;
+			uint32_t maximum_packet_size;
+			struct ist assigned_client_identifier;
+			uint16_t topic_alias_maximum;
+			struct ist reason_string;
+			struct {
+				struct ist name;
+				struct ist value;
+			} user_props[MQTT_PROP_USER_PROPERTY_ENTRIES];
+			uint8_t wildcard_subscription_available;
+			uint8_t subscription_identifiers_available;
+			uint8_t shared_subsription_available;
+			uint16_t server_keepalive;
+			struct ist response_information;
+			struct ist server_reference;
+			struct ist authentication_method;
+			struct ist authentication_data;
+		} props;
+	} var_hdr;
+};
+
+/* structure to host a MQTT packet */
+struct mqtt_pkt {
+	struct {
+		uint8_t type;              /* MQTT_CPT_* */
+		uint8_t flags;             /* MQTT_CPT_FL* */
+		uint32_t remaining_length;
+	} fixed_hdr;
+	union {
+		struct connect connect;
+		struct connack connack;
+	} data;
+};
+
+#endif /* _HAPROXY_MQTT_T_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/mqtt.h b/include/haproxy/mqtt.h
new file mode 100644
index 0000000..6720bb7
--- /dev/null
+++ b/include/haproxy/mqtt.h
@@ -0,0 +1,118 @@
+/*
+ * include/haproxy/mqtt.h
+ * This file contains structure declarations for MQTT protocol.
+ *
+ * Copyright 2020 Baptiste Assmann <bedis9@gmail.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_MQTT_H
+#define _HAPROXY_MQTT_H
+
+#include <import/ist.h>
+
+#include <haproxy/mqtt-t.h>
+#include <haproxy/tools.h>
+
+/* expected flags for control packets */
+extern uint8_t mqtt_cpt_flags[MQTT_CPT_ENTRIES];
+
+/* MQTT field string names */
+extern const struct ist mqtt_fields_string[MQTT_FN_ENTRIES];
+
+/* list of supported capturable field names for each MQTT control packet type */
+extern const uint64_t mqtt_fields_per_packet[MQTT_CPT_ENTRIES];
+
+int mqtt_validate_message(const struct ist msg, struct mqtt_pkt *mpkt);
+struct ist mqtt_field_value(const struct ist msg, int type, int fieldname_id);
+
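+/* Example: per the fixed-header layout described in mqtt-t.h, the packet
+ * type occupies the 4 upper bits of the first byte, so a raw byte must be
+ * shifted right by 4 bits before being compared to the MQTT_CPT_* values
+ * (a minimal sketch, <first_byte> being the first byte of a packet):
+ *
+ *   uint8_t type  = first_byte >> 4;    // one of MQTT_CPT_*
+ *   uint8_t flags = first_byte & 0x0f;  // packet-specific flags
+ *
+ *   if (type == MQTT_CPT_INVALID || type >= MQTT_CPT_ENTRIES)
+ *       ...reject the message...
+ */
+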
+/*
+ * Return the MQTT packet type ID found in <str>.
+ * <str> can be a number or a string and the returned value will always be the numeric value.
+ *
+ * If <str> can't be translated into an ID, then MQTT_CPT_INVALID (0) is returned.
+ */
+static inline int mqtt_typeid(struct ist str)
+{
+	int id;
+
+	id = strl2ui(str.ptr, istlen(str));
+	if ((id >= MQTT_CPT_CONNECT) && (id < MQTT_CPT_ENTRIES))
+		return id;
+
+	else if (isteqi(str, ist("CONNECT")) != 0)
+		return MQTT_CPT_CONNECT;
+	else if (isteqi(str, ist("CONNACK")) != 0)
+		return MQTT_CPT_CONNACK;
+	else if (isteqi(str, ist("PUBLISH")) != 0)
+		return MQTT_CPT_PUBLISH;
+	else if (isteqi(str, ist("PUBACK")) != 0)
+		return MQTT_CPT_PUBACK;
+	else if (isteqi(str, ist("PUBREC")) != 0)
+		return MQTT_CPT_PUBREC;
+	else if (isteqi(str, ist("PUBREL")) != 0)
+		return MQTT_CPT_PUBREL;
+	else if (isteqi(str, ist("PUBCOMP")) != 0)
+		return MQTT_CPT_PUBCOMP;
+	else if (isteqi(str, ist("SUBSCRIBE")) != 0)
+		return MQTT_CPT_SUBSCRIBE;
+	else if (isteqi(str, ist("SUBACK")) != 0)
+		return MQTT_CPT_SUBACK;
+	else if (isteqi(str, ist("UNSUBSCRIBE")) != 0)
+		return MQTT_CPT_UNSUBSCRIBE;
+	else if (isteqi(str, ist("UNSUBACK")) != 0)
+		return MQTT_CPT_UNSUBACK;
+	else if (isteqi(str, ist("PINGREQ")) != 0)
+		return MQTT_CPT_PINGREQ;
+	else if (isteqi(str, ist("PINGRESP")) != 0)
+		return MQTT_CPT_PINGRESP;
+	else if (isteqi(str, ist("DISCONNECT")) != 0)
+		return MQTT_CPT_DISCONNECT;
+	else if (isteqi(str, ist("AUTH")) != 0)
+		return MQTT_CPT_AUTH;
+
+	return MQTT_CPT_INVALID;
+}
+
+/*
+ * validate that <str> is a field that can be extracted from a <type> MQTT packet
+ *
+ * return the field name ID (MQTT_FN_*) if a match is found, MQTT_FN_INVALID (0) otherwise.
+ */
+static inline int mqtt_check_type_fieldname(int type, struct ist str)
+{
+	int i, id = MQTT_FN_INVALID;
+
+	for (i = 0; i < MQTT_FN_ENTRIES; i++) {
+		if (isteqi(str, mqtt_fields_string[i])) {
+			if (mqtt_fields_per_packet[type] & (1ULL << i))
+				id = i;
+			break;
+		}
+	}
+
+	return id;
+}
+
+#endif /* _HAPROXY_MQTT_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/mux_fcgi-t.h b/include/haproxy/mux_fcgi-t.h
new file mode 100644
index 0000000..27973db
--- /dev/null
+++ b/include/haproxy/mux_fcgi-t.h
@@ -0,0 +1,175 @@
+/*
+ * include/haproxy/mux_fcgi-t.h
+ * Definitions for basic FCGI mux internal types, constants and flags.
+ *
+ * Copyright 2022 Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_MUX_FCGI_T_H
+#define _HAPROXY_MUX_FCGI_T_H
+
+#include <haproxy/api-t.h>
+#include <haproxy/show_flags-t.h>
+
+/**** FCGI connection flags (32 bit), in fcgi_conn->flags ****/
+#define FCGI_CF_NONE            0x00000000
+
+/* Flags indicating why writing to the mux is blocked */
+#define FCGI_CF_MUX_MALLOC      0x00000001 /* mux is blocked on lack of connection's mux buffer */
+#define FCGI_CF_MUX_MFULL       0x00000002 /* mux is blocked on connection's mux buffer full */
+#define FCGI_CF_MUX_BLOCK_ANY   0x00000003 /* aggregate of the mux flags above */
+
+/* Flags indicating why writing to the demux is blocked.
+ * The first two ones directly affect the ability for the mux to receive data
+ * from the connection. The other ones affect the mux's ability to demux
+ * received data.
+ */
+#define FCGI_CF_DEM_DALLOC      0x00000004 /* demux blocked on lack of connection's demux buffer */
+#define FCGI_CF_DEM_DFULL       0x00000008 /* demux blocked on connection's demux buffer full */
+#define FCGI_CF_DEM_MROOM       0x00000010 /* demux blocked on lack of room in mux buffer */
+#define FCGI_CF_DEM_SALLOC      0x00000020 /* demux blocked on lack of stream's rx buffer */
+#define FCGI_CF_DEM_SFULL       0x00000040 /* demux blocked on stream request buffer full */
+#define FCGI_CF_DEM_TOOMANY     0x00000080 /* demux blocked waiting for some stream connectors to leave */
+#define FCGI_CF_DEM_BLOCK_ANY   0x000000F0 /* aggregate of the demux flags above except DALLOC/DFULL */
+
+/* Other flags */
+#define FCGI_CF_MPXS_CONNS      0x00000100 /* connection multiplexing is supported */
+#define FCGI_CF_ABRTS_SENT      0x00000200 /* an ABORT record was successfully sent to all active streams */
+#define FCGI_CF_ABRTS_FAILED    0x00000400 /* failed to abort processing of all streams */
+#define FCGI_CF_WAIT_FOR_HS     0x00000800 /* We did check that at least a stream was waiting for handshake */
+#define FCGI_CF_KEEP_CONN       0x00001000 /* HAProxy is responsible for closing the connection */
+#define FCGI_CF_GET_VALUES      0x00002000 /* retrieve settings */
+
+#define FCGI_CF_EOS             0x00004000 /* End-of-stream seen on the connection (read0 detected) */
+#define FCGI_CF_ERR_PENDING     0x00008000 /* A write error was detected (block sends but not reads) */
+#define FCGI_CF_ERROR           0x00010000 /* A read error was detected (handled as an abort) */
+
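+/* Example: these flags are usually rendered with the fconn_show_flags()
+ * helper defined below, e.g. from a debugging hook (a minimal sketch,
+ * assuming <fconn> points to a valid fcgi_conn exposing its <flags>):
+ *
+ *   char buf[256] = "";
+ *
+ *   fconn_show_flags(buf, sizeof(buf), "|", fconn->flags);
+ *   // buf now holds e.g. "FCGI_CF_MUX_MFULL|FCGI_CF_DEM_SFULL"
+ */
+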
+/* This function is used to report flags in debugging tools. Please reflect
+ * below any single-bit flag addition above in the same order via the
+ * __APPEND_FLAG macro. The new end of the buffer is returned.
+ */
+static forceinline char *fconn_show_flags(char *buf, size_t len, const char *delim, uint flg)
+{
+#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__)
+	/* prologue */
+	_(0);
+	/* flags */
+	_(FCGI_CF_MUX_MALLOC, _(FCGI_CF_MUX_MFULL,
+	_(FCGI_CF_DEM_DALLOC, _(FCGI_CF_DEM_DFULL, _(FCGI_CF_DEM_MROOM,
+	_(FCGI_CF_DEM_SALLOC, _(FCGI_CF_DEM_SFULL, _(FCGI_CF_DEM_TOOMANY,
+	_(FCGI_CF_MPXS_CONNS, _(FCGI_CF_ABRTS_SENT, _(FCGI_CF_ABRTS_FAILED,
+	_(FCGI_CF_WAIT_FOR_HS, _(FCGI_CF_KEEP_CONN, _(FCGI_CF_GET_VALUES,
+	_(FCGI_CF_EOS, _(FCGI_CF_ERR_PENDING, _(FCGI_CF_ERROR)))))))))))))))));
+	/* epilogue */
+	_(~0U);
+	return buf;
+#undef _
+}
+
+/**** FCGI stream flags (32 bit), in fcgi_strm->flags ****/
+#define FCGI_SF_NONE            0x00000000
+#define FCGI_SF_ES_RCVD         0x00000001 /* end-of-stream received (empty STDOUT or END_REQUEST record) */
+#define FCGI_SF_ES_SENT         0x00000002 /* end-of-stream sent (empty STDIN record) */
+#define FCGI_SF_EP_SENT         0x00000004 /* end-of-param sent (empty PARAMS record) */
+#define FCGI_SF_ABRT_SENT       0x00000008 /* abort sent (ABORT_REQUEST record) */
+
+/* Stream flags indicating the reason the stream is blocked */
+#define FCGI_SF_BLK_MBUSY       0x00000010 /* blocked waiting for mux access (transient) */
+#define FCGI_SF_BLK_MROOM       0x00000020 /* blocked waiting for room in the mux */
+#define FCGI_SF_BLK_ANY         0x00000030 /* any of the reasons above */
+
+#define FCGI_SF_BEGIN_SENT      0x00000100 /* a BEGIN_REQUEST record was sent for this stream */
+#define FCGI_SF_OUTGOING_DATA   0x00000200 /* set whenever we've seen outgoing data */
+#define FCGI_SF_NOTIFIED        0x00000400 /* a paused stream was notified to try to send again */
+
+#define FCGI_SF_WANT_SHUTR      0x00001000 /* a stream couldn't shutr() (mux full/busy) */
+#define FCGI_SF_WANT_SHUTW      0x00002000 /* a stream couldn't shutw() (mux full/busy) */
+
+/* This function is used to report flags in debugging tools. Please reflect
+ * below any single-bit flag addition above in the same order via the
+ * __APPEND_FLAG macro. The new end of the buffer is returned.
+ */
+static forceinline char *fstrm_show_flags(char *buf, size_t len, const char *delim, uint flg)
+{
+#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__)
+	/* prologue */
+	_(0);
+	/* flags */
+	_(FCGI_SF_ES_RCVD, _(FCGI_SF_ES_SENT, _(FCGI_SF_EP_SENT, _(FCGI_SF_ABRT_SENT,
+	_(FCGI_SF_BLK_MBUSY, _(FCGI_SF_BLK_MROOM,
+	_(FCGI_SF_BEGIN_SENT, _(FCGI_SF_OUTGOING_DATA, _(FCGI_SF_NOTIFIED,
+	_(FCGI_SF_WANT_SHUTR, _(FCGI_SF_WANT_SHUTW)))))))))));
+	/* epilogue */
+	_(~0U);
+	return buf;
+#undef _
+}
+
+/* FCGI connection state (fcgi_conn->state) */
+enum fcgi_conn_st {
+	FCGI_CS_INIT = 0,    /* init done, waiting for sending GET_VALUES record */
+	FCGI_CS_SETTINGS,    /* GET_VALUES sent, waiting for the GET_VALUES_RESULT record */
+	FCGI_CS_RECORD_H,    /* GET_VALUES_RESULT received, waiting for a record header */
+	FCGI_CS_RECORD_D,    /* Record header OK, waiting for a record data */
+	FCGI_CS_RECORD_P,    /* Record processed, the padding remains to be skipped */
+	FCGI_CS_CLOSED,      /* abort requests if necessary and close the connection ASAP */
+	FCGI_CS_ENTRIES
+} __attribute__((packed));
+
+/* returns a fconn state as an abbreviated 3-letter string, or "???"
if unknown */ +static inline const char *fconn_st_to_str(enum fcgi_conn_st st) +{ + switch (st) { + case FCGI_CS_INIT : return "INI"; + case FCGI_CS_SETTINGS : return "STG"; + case FCGI_CS_RECORD_H : return "RDH"; + case FCGI_CS_RECORD_D : return "RDD"; + case FCGI_CS_RECORD_P : return "RDP"; + case FCGI_CS_CLOSED : return "CLO"; + default : return "???"; + } +} + +/* FCGI stream state, in fcgi_strm->state */ +enum fcgi_strm_st { + FCGI_SS_IDLE = 0, + FCGI_SS_OPEN, + FCGI_SS_HREM, // half-closed(remote) + FCGI_SS_HLOC, // half-closed(local) + FCGI_SS_ERROR, + FCGI_SS_CLOSED, + FCGI_SS_ENTRIES +} __attribute__((packed)); + + +/* returns a fstrm state as an abbreviated 3-letter string, or "???" if unknown */ +static inline const char *fstrm_st_to_str(enum fcgi_strm_st st) +{ + switch (st) { + case FCGI_SS_IDLE : return "IDL"; + case FCGI_SS_OPEN : return "OPN"; + case FCGI_SS_HREM : return "RCL"; + case FCGI_SS_HLOC : return "HCL"; + case FCGI_SS_ERROR : return "ERR"; + case FCGI_SS_CLOSED : return "CLO"; + default : return "???"; + } +} + + +#endif /* _HAPROXY_MUX_FCGI_T_H */ diff --git a/include/haproxy/mux_h1-t.h b/include/haproxy/mux_h1-t.h new file mode 100644 index 0000000..2f49a49 --- /dev/null +++ b/include/haproxy/mux_h1-t.h @@ -0,0 +1,160 @@ +/* + * include/haproxy/mux_h1-t.h + * Definitions for basic H1 mux internal types, constants and flags. + * + * Copyright 2022 Christopher Faulet <cfaulet@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_MUX_H1_T_H +#define _HAPROXY_MUX_H1_T_H + +#include <haproxy/api-t.h> +#include <haproxy/show_flags-t.h> + +/**** Connection flags (32 bit), in h1c->flags ****/ +#define H1C_F_NONE 0x00000000 + +/* Flags indicating why writing output data are blocked */ +#define H1C_F_OUT_ALLOC 0x00000001 /* mux is blocked on lack of output buffer */ +#define H1C_F_OUT_FULL 0x00000002 /* mux is blocked on output buffer full */ +/* 0x00000004 - 0x00000008 unused */ + +/* Flags indicating why reading input data are blocked. 
 */
+#define H1C_F_IN_ALLOC       0x00000010 /* mux is blocked on lack of input buffer */
+#define H1C_F_IN_FULL        0x00000020 /* mux is blocked on input buffer full */
+#define H1C_F_IN_SALLOC      0x00000040 /* mux is blocked on lack of stream's request buffer */
+/* 0x00000080 unused */
+
+#define H1C_F_EOS            0x00000100 /* End-of-stream seen on the H1 connection (read0 detected) */
+#define H1C_F_ERR_PENDING    0x00000200 /* A write error was detected (block sends but not reads) */
+#define H1C_F_ERROR          0x00000400 /* A read error was detected (handled as an abort) */
+#define H1C_F_SILENT_SHUT    0x00000800 /* if the H1C is closed, a silent (or dirty) shutdown must be performed */
+#define H1C_F_ABRT_PENDING   0x00001000 /* An error must be sent (previous attempt failed) and H1 connection must be closed ASAP */
+#define H1C_F_ABRTED         0x00002000 /* An error must be sent (previous attempt failed) and H1 connection must be closed ASAP */
+#define H1C_F_WANT_FASTFWD   0x00004000 /* Don't read into a buffer because we want to fast forward data */
+#define H1C_F_WAIT_NEXT_REQ  0x00008000 /* waiting for the next request to start, use keep-alive timeout */
+#define H1C_F_UPG_H2C        0x00010000 /* set if an upgrade to h2 should be done */
+#define H1C_F_CO_MSG_MORE    0x00020000 /* set if CO_SFL_MSG_MORE must be set when calling xprt->snd_buf() */
+#define H1C_F_CO_STREAMER    0x00040000 /* set if CO_SFL_STREAMER must be set when calling xprt->snd_buf() */
+#define H1C_F_CANT_FASTFWD   0x00080000 /* Fast-forwarding is not supported (exclusive with WANT_FASTFWD) */
+
+/* 0x00100000 - 0x40000000 unused */
+#define H1C_F_IS_BACK        0x80000000 /* Set on outgoing connection */
+
+
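+/* Example: the EOS/ERR_PENDING/ERROR trio above is meant to be checked in
+ * order of severity (a minimal sketch, assuming <h1c> is a valid connection
+ * context exposing its <flags>):
+ *
+ *   if (h1c->flags & H1C_F_ERROR)
+ *       ...stop all processing: a read error was handled as an abort...
+ *   else if (h1c->flags & H1C_F_ERR_PENDING)
+ *       ...block further sends but keep draining pending input...
+ *   else if (h1c->flags & H1C_F_EOS)
+ *       ...read0 was seen: no more input will arrive...
+ */
+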
+/* This function is used to report flags in debugging tools. Please reflect
+ * below any single-bit flag addition above in the same order via the
+ * __APPEND_FLAG macro. The new end of the buffer is returned.
+ */
+static forceinline char *h1c_show_flags(char *buf, size_t len, const char *delim, uint flg)
+{
+#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__)
+	/* prologue */
+	_(0);
+	/* flags */
+	_(H1C_F_OUT_ALLOC, _(H1C_F_OUT_FULL,
+	_(H1C_F_IN_ALLOC, _(H1C_F_IN_FULL, _(H1C_F_IN_SALLOC,
+	_(H1C_F_EOS, _(H1C_F_ERR_PENDING, _(H1C_F_ERROR,
+	_(H1C_F_SILENT_SHUT, _(H1C_F_ABRT_PENDING, _(H1C_F_ABRTED,
+	_(H1C_F_WANT_FASTFWD, _(H1C_F_WAIT_NEXT_REQ, _(H1C_F_UPG_H2C, _(H1C_F_CO_MSG_MORE,
+	_(H1C_F_CO_STREAMER, _(H1C_F_CANT_FASTFWD, _(H1C_F_IS_BACK))))))))))))))))));
+	/* epilogue */
+	_(~0U);
+	return buf;
+#undef _
+}
+
+
+/**** H1 stream flags (32 bit), in h1s->flags ****/
+#define H1S_F_NONE           0x00000000
+
+#define H1S_F_RX_BLK         0x00100000 /* Don't process more input data, waiting sync with output side */
+#define H1S_F_TX_BLK         0x00200000 /* Don't process more output data, waiting sync with input side */
+#define H1S_F_RX_CONGESTED   0x00000004 /* Cannot process input data: the RX path is congested (waiting for more space in the channel's buffer) */
+
+/* 0x00000008 unused */
+#define H1S_F_WANT_KAL       0x00000010
+#define H1S_F_WANT_TUN       0x00000020
+#define H1S_F_WANT_CLO       0x00000040
+#define H1S_F_WANT_MSK       0x00000070
+#define H1S_F_NOT_FIRST      0x00000080 /* The H1 stream is not the first one */
+#define H1S_F_BODYLESS_RESP  0x00000100 /* Bodyless response message */
+
+#define H1S_F_INTERNAL_ERROR 0x00000200 /* Set when an internal error occurred during the message parsing */
+#define H1S_F_NOT_IMPL_ERROR 0x00000400 /* Set when a feature is not implemented during the message parsing */
+#define H1S_F_PARSING_ERROR  0x00000800 /* Set when an error occurred during the message parsing */
+#define H1S_F_PROCESSING_ERROR 0x00001000 /* Set when an error occurred during the message xfer */
+#define H1S_F_ERROR_MASK     0x00003800 /* stream error mask */
+
+#define H1S_F_HAVE_SRV_NAME  0x00002000 /* Set during output process if the server name header was added to the request */
+#define H1S_F_HAVE_O_CONN    0x00004000 /* Set during output process to know connection mode was processed */
+#define H1S_F_HAVE_WS_KEY    0x00008000 /* Set during output process to know WS key was found or generated */
+#define H1S_F_HAVE_CLEN      0x00010000 /* Set during output process to know the Content-Length header was found or generated */
+#define H1S_F_HAVE_CHNK      0x00020000 /* Set during output process to know the "Transfer-Encoding: chunked" header was found or generated */
+
+/* This function is used to report flags in debugging tools. Please reflect
+ * below any single-bit flag addition above in the same order via the
+ * __APPEND_FLAG macro. The new end of the buffer is returned.
+ */
+static forceinline char *h1s_show_flags(char *buf, size_t len, const char *delim, uint flg)
+{
+#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__)
+	/* prologue */
+	_(0);
+	/* flags */
+	_(H1S_F_RX_BLK, _(H1S_F_TX_BLK, _(H1S_F_RX_CONGESTED,
+	_(H1S_F_WANT_KAL, _(H1S_F_WANT_TUN, _(H1S_F_WANT_CLO,
+	_(H1S_F_NOT_FIRST, _(H1S_F_BODYLESS_RESP,
+	_(H1S_F_INTERNAL_ERROR, _(H1S_F_NOT_IMPL_ERROR, _(H1S_F_PARSING_ERROR, _(H1S_F_PROCESSING_ERROR,
+	_(H1S_F_HAVE_SRV_NAME, _(H1S_F_HAVE_O_CONN, _(H1S_F_HAVE_WS_KEY,
+	_(H1S_F_HAVE_CLEN, _(H1S_F_HAVE_CHNK)))))))))))))))));
+	/* epilogue */
+	_(~0U);
+	return buf;
+#undef _
+}
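+
+/* Example: H1S_F_WANT_KAL/TUN/CLO above are exclusive values covered by the
+ * H1S_F_WANT_MSK mask, so the negotiated connection mode is read by masking
+ * rather than by testing individual bits (a minimal sketch):
+ *
+ *   switch (h1s->flags & H1S_F_WANT_MSK) {
+ *   case H1S_F_WANT_KAL: ...keep-alive... break;
+ *   case H1S_F_WANT_TUN: ...tunnel...     break;
+ *   case H1S_F_WANT_CLO: ...close...      break;
+ *   }
+ */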
+
+/* H1 connection state, in h1c->state */
+enum h1_cs {
+	H1_CS_IDLE,        /* IDLE connection. A freshly opened or reusable connection (H1S is NULL) */
+	H1_CS_EMBRYONIC,   /* Connection is waiting for the message headers (H1S is not NULL, not attached to a SC - Frontend connection only) */
+	H1_CS_UPGRADING,   /* TCP>H1 upgrade in-progress (H1S is not NULL and attached to a SC - Frontend connection only) */
+	H1_CS_RUNNING,     /* Connection fully established and the H1S is processing data (H1S is not NULL and attached to a SC) */
+	H1_CS_CLOSING,     /* Send pending outgoing data and close the connection ASAP (H1S may be NULL) */
+	H1_CS_CLOSED,      /* Connection must be closed now and H1C must be released (H1S is NULL) */
+	H1_CS_ENTRIES,
+} __attribute__((packed));
+
+
+/**** tiny state decoding functions for debug helpers ****/
+
+/* returns a h1c state as an abbreviated 3-letter string, or "???" if unknown */
+static inline const char *h1c_st_to_str(enum h1_cs st)
+{
+	switch (st) {
+	case H1_CS_IDLE:      return "IDL";
+	case H1_CS_EMBRYONIC: return "EMB";
+	case H1_CS_UPGRADING: return "UPG";
+	case H1_CS_RUNNING:   return "RUN";
+	case H1_CS_CLOSING:   return "CLI";
+	case H1_CS_CLOSED:    return "CLD";
+	default:              return "???";
+	}
+}
+
+
+#endif /* _HAPROXY_MUX_H1_T_H */
diff --git a/include/haproxy/mux_h2-t.h b/include/haproxy/mux_h2-t.h
new file mode 100644
index 0000000..ccb40b2
--- /dev/null
+++ b/include/haproxy/mux_h2-t.h
@@ -0,0 +1,222 @@
+/*
+ * include/haproxy/mux_h2-t.h
+ * Definitions for basic H2 mux internal types, constants and flags.
+ *
+ * Copyright 2017-2022 Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_MUX_H2_T_H
+#define _HAPROXY_MUX_H2_T_H
+
+#include <haproxy/api-t.h>
+#include <haproxy/show_flags-t.h>
+
+/**** Connection flags (32 bit), in h2c->flags ****/
+
+#define H2_CF_NONE              0x00000000
+
+/* Flags indicating why writing to the mux is blocked. */
+#define H2_CF_MUX_MALLOC        0x00000001  // mux blocked on lack of connection's mux buffer
+#define H2_CF_MUX_MFULL         0x00000002  // mux blocked on connection's mux buffer full
+#define H2_CF_MUX_BLOCK_ANY     0x00000003  // aggregate of the mux flags above
+
+/* Flags indicating why writing to the demux is blocked.
+ * The first two ones directly affect the ability for the mux to receive data
+ * from the connection. The other ones affect the mux's ability to demux
+ * received data.
+ */
+#define H2_CF_DEM_DALLOC        0x00000004  // demux blocked on lack of connection's demux buffer
+#define H2_CF_DEM_DFULL         0x00000008  // demux blocked on connection's demux buffer full
+
+#define H2_CF_WAIT_INLIST       0x00000010  // there is at least one stream blocked by another stream in send_list/fctl_list
+#define H2_CF_DEM_MROOM         0x00000020  // demux blocked on lack of room in mux buffer
+#define H2_CF_DEM_SALLOC        0x00000040  // demux blocked on lack of stream's request buffer
+#define H2_CF_DEM_SFULL         0x00000080  // demux blocked on stream request buffer full
+#define H2_CF_DEM_TOOMANY       0x00000100  // demux blocked waiting for some stream connectors to leave
+#define H2_CF_DEM_BLOCK_ANY     0x000001E0  // aggregate of the demux flags above except DALLOC/DFULL
+                                            // (SHORT_READ is also excluded)
+
+#define H2_CF_DEM_SHORT_READ    0x00000200  // demux blocked on incomplete frame
+#define H2_CF_DEM_IN_PROGRESS   0x00000400  // demux in progress (dsi,dfl,dft are valid)
+
+/* other flags */
+#define H2_CF_MBUF_HAS_DATA     0x00000800  // some stream data (data, headers) still in mbuf
+#define H2_CF_GOAWAY_SENT       0x00001000  // a GOAWAY frame was successfully sent
+#define H2_CF_GOAWAY_FAILED     0x00002000  // a GOAWAY frame failed to be sent
+#define H2_CF_WAIT_FOR_HS       0x00004000  // We did check that at least a stream was waiting for handshake
+#define H2_CF_IS_BACK           0x00008000  // this is an outgoing connection
+#define H2_CF_WINDOW_OPENED     0x00010000  // demux increased window already advertised
+#define H2_CF_RCVD_SHUT         0x00020000  // a recv() attempt already failed on a shutdown
+#define H2_CF_END_REACHED       0x00040000  // pending data too short with RCVD_SHUT present
+
+#define H2_CF_RCVD_RFC8441      0x00100000  // settings from RFC 8441 have been received, indicating support for Extended CONNECT
+#define H2_CF_SHTS_UPDATED      0x00200000  // SETTINGS_HEADER_TABLE_SIZE updated
+#define H2_CF_DTSU_EMITTED      0x00400000  // HPACK Dynamic Table Size Update opcode emitted
+
+#define H2_CF_ERR_PENDING       0x00800000  // A write error was detected (block sends but not reads)
+#define H2_CF_ERROR             0x01000000  // A read error was detected (handled as an abort)
+
+/* This function is used to report flags in debugging tools. Please reflect
+ * below any single-bit flag addition above in the same order via the
+ * __APPEND_FLAG macro. The new end of the buffer is returned.
+ */
+static forceinline char *h2c_show_flags(char *buf, size_t len, const char *delim, uint flg)
+{
+#define _(f, ...)
__APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__) + /* prologue */ + _(0); + /* flags */ + _(H2_CF_MUX_MALLOC, _(H2_CF_MUX_MFULL, _(H2_CF_DEM_DALLOC, + _(H2_CF_DEM_DFULL, _(H2_CF_WAIT_INLIST, _(H2_CF_DEM_MROOM, + _(H2_CF_DEM_SALLOC, _(H2_CF_DEM_SFULL, _(H2_CF_DEM_TOOMANY, + _(H2_CF_DEM_SHORT_READ, _(H2_CF_DEM_IN_PROGRESS, _(H2_CF_MBUF_HAS_DATA, + _(H2_CF_GOAWAY_SENT, _(H2_CF_GOAWAY_FAILED, _(H2_CF_WAIT_FOR_HS, _(H2_CF_IS_BACK, + _(H2_CF_WINDOW_OPENED, _(H2_CF_RCVD_SHUT, _(H2_CF_END_REACHED, + _(H2_CF_RCVD_RFC8441, _(H2_CF_SHTS_UPDATED, _(H2_CF_DTSU_EMITTED, + _(H2_CF_ERR_PENDING, _(H2_CF_ERROR)))))))))))))))))))))))); + /* epilogue */ + _(~0U); + return buf; +#undef _ +} + + +/**** HTTP/2 stream flags (32 bit), in h2s->flags ****/ + +#define H2_SF_NONE 0x00000000 +#define H2_SF_ES_RCVD 0x00000001 +#define H2_SF_ES_SENT 0x00000002 + +#define H2_SF_RST_RCVD 0x00000004 // received RST_STREAM +#define H2_SF_RST_SENT 0x00000008 // sent RST_STREAM + +/* stream flags indicating the reason the stream is blocked */ +#define H2_SF_BLK_MBUSY 0x00000010 // blocked waiting for mux access (transient) +#define H2_SF_BLK_MROOM 0x00000020 // blocked waiting for room in the mux (must be in send list) +#define H2_SF_BLK_MFCTL 0x00000040 // blocked due to mux fctl (must be in fctl list) +#define H2_SF_BLK_SFCTL 0x00000080 // blocked due to stream fctl (must be in blocked list) +#define H2_SF_BLK_ANY 0x000000F0 // any of the reasons above + +/* stream flags indicating how data is supposed to be sent */ +#define H2_SF_DATA_CLEN 0x00000100 // data sent using content-length +#define H2_SF_BODYLESS_RESP 0x00000200 /* Bodyless response message */ +#define H2_SF_BODY_TUNNEL 0x00000400 // Attempt to establish a Tunnelled stream (the result depends on the status code) + +#define H2_SF_NOTIFIED 0x00000800 // a paused stream was notified to try to send again +#define H2_SF_HEADERS_SENT 0x00001000 // a HEADERS frame was sent for this stream +#define H2_SF_OUTGOING_DATA 0x00002000 // set whenever we've seen outgoing data + +#define H2_SF_HEADERS_RCVD 0x00004000 // a HEADERS frame was received for this stream + +#define H2_SF_WANT_SHUTR 0x00008000 // a stream couldn't shutr() (mux full/busy) +#define H2_SF_WANT_SHUTW 0x00010000 // a stream couldn't shutw() (mux full/busy) + +#define H2_SF_EXT_CONNECT_SENT 0x00040000 // rfc 8441 an Extended CONNECT has been sent +#define H2_SF_EXT_CONNECT_RCVD 0x00080000 // rfc 8441 an Extended CONNECT has been received and parsed + +#define H2_SF_TUNNEL_ABRT 0x00100000 // A tunnel attempt was aborted +#define H2_SF_MORE_HTX_DATA 0x00200000 // more data expected from HTX + +/* This function is used to report flags in debugging tools. Please reflect + * below any single-bit flag addition above in the same order via the + * __APPEND_FLAG macro. The new end of the buffer is returned. + */ +static forceinline char *h2s_show_flags(char *buf, size_t len, const char *delim, uint flg) +{ +#define _(f, ...) 
__APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__) + /* prologue */ + _(0); + /* flags */ + _(H2_SF_ES_RCVD, _(H2_SF_ES_SENT, _(H2_SF_RST_RCVD, _(H2_SF_RST_SENT, + _(H2_SF_BLK_MBUSY, _(H2_SF_BLK_MROOM, _(H2_SF_BLK_MFCTL, + _(H2_SF_BLK_SFCTL, _(H2_SF_DATA_CLEN, _(H2_SF_BODYLESS_RESP, + _(H2_SF_BODY_TUNNEL, _(H2_SF_NOTIFIED, _(H2_SF_HEADERS_SENT, + _(H2_SF_OUTGOING_DATA, _(H2_SF_HEADERS_RCVD, _(H2_SF_WANT_SHUTR, + _(H2_SF_WANT_SHUTW, _(H2_SF_EXT_CONNECT_SENT, _(H2_SF_EXT_CONNECT_RCVD, + _(H2_SF_TUNNEL_ABRT, _(H2_SF_MORE_HTX_DATA))))))))))))))))))))); + /* epilogue */ + _(~0U); + return buf; +#undef _ +} + + +/* H2 connection state, in h2c->st0 */ +enum h2_cs { + H2_CS_PREFACE, // init done, waiting for connection preface + H2_CS_SETTINGS1, // preface OK, waiting for first settings frame + H2_CS_FRAME_H, // first settings frame ok, waiting for frame header + H2_CS_FRAME_P, // frame header OK, waiting for frame payload + H2_CS_FRAME_A, // frame payload OK, trying to send ACK frame + H2_CS_FRAME_E, // frame payload OK, trying to send RST frame + H2_CS_ERROR, // send GOAWAY(errcode) and close the connection ASAP + H2_CS_ERROR2, // GOAWAY(errcode) sent, close the connection ASAP + H2_CS_ENTRIES // must be last +} __attribute__((packed)); + +/* H2 stream state, in h2s->st */ +enum h2_ss { + H2_SS_IDLE = 0, // idle + H2_SS_RLOC, // reserved(local) + H2_SS_RREM, // reserved(remote) + H2_SS_OPEN, // open + H2_SS_HREM, // half-closed(remote) + H2_SS_HLOC, // half-closed(local) + H2_SS_ERROR, // an error needs to be sent using RST_STREAM + H2_SS_CLOSED, // closed + H2_SS_ENTRIES // must be last +} __attribute__((packed)); + + +/* 32 buffers: one for the ring's root, rest for the mbuf itself */ +#define H2C_MBUF_CNT 32 + +/**** tiny state decoding functions for debug helpers ****/ + +/* returns a h2c state as an abbreviated 3-letter string, or "???" if unknown */ +static inline const char *h2c_st_to_str(enum h2_cs st) +{ + switch (st) { + case H2_CS_PREFACE: return "PRF"; + case H2_CS_SETTINGS1: return "STG"; + case H2_CS_FRAME_H: return "FRH"; + case H2_CS_FRAME_P: return "FRP"; + case H2_CS_FRAME_A: return "FRA"; + case H2_CS_FRAME_E: return "FRE"; + case H2_CS_ERROR: return "ERR"; + case H2_CS_ERROR2: return "ER2"; + default: return "???"; + } +} + +/* returns a h2s state as an abbreviated 3-letter string, or "???" 
if unknown */
+static inline const char *h2s_st_to_str(enum h2_ss st)
+{
+	switch (st) {
+	case H2_SS_IDLE:   return "IDL"; // idle
+	case H2_SS_RLOC:   return "RSL"; // reserved local
+	case H2_SS_RREM:   return "RSR"; // reserved remote
+	case H2_SS_OPEN:   return "OPN"; // open
+	case H2_SS_HREM:   return "HCR"; // half-closed remote
+	case H2_SS_HLOC:   return "HCL"; // half-closed local
+	case H2_SS_ERROR:  return "ERR"; // error
+	case H2_SS_CLOSED: return "CLO"; // closed
+	default:           return "???";
+	}
+}
+
+#endif /* _HAPROXY_MUX_H2_T_H */ diff --git a/include/haproxy/mux_quic-t.h b/include/haproxy/mux_quic-t.h new file mode 100644 index 0000000..abfc20a --- /dev/null +++ b/include/haproxy/mux_quic-t.h @@ -0,0 +1,204 @@ +#ifndef _HAPROXY_MUX_QUIC_T_H
+#define _HAPROXY_MUX_QUIC_T_H
+
+#ifdef USE_QUIC
+#ifndef USE_OPENSSL
+#error "Must define USE_OPENSSL"
+#endif
+
+#include <import/ebtree-t.h>
+
+#include <haproxy/buf-t.h>
+#include <haproxy/connection-t.h>
+#include <haproxy/htx-t.h>
+#include <haproxy/list-t.h>
+#include <haproxy/ncbuf-t.h>
+#include <haproxy/quic_frame-t.h>
+#include <haproxy/quic_stream-t.h>
+#include <haproxy/stconn-t.h>
+
+/* Stream types */
+enum qcs_type {
+	QCS_CLT_BIDI,
+	QCS_SRV_BIDI,
+	QCS_CLT_UNI,
+	QCS_SRV_UNI,
+
+	/* Must be the last one */
+	QCS_MAX_TYPES
+};
+
+#define QC_CF_ERRL      0x00000001 /* fatal error detected locally, connection should be closed soon */
+#define QC_CF_ERRL_DONE 0x00000002 /* local error properly handled, connection can be released */
+#define QC_CF_BLK_MFCTL 0x00000004 /* sending blocked due to connection flow-control */
+#define QC_CF_CONN_FULL 0x00000008 /* no stream buffers available on connection */
+#define QC_CF_APP_SHUT  0x00000010 /* Application layer shutdown done. */
+#define QC_CF_ERR_CONN  0x00000020 /* fatal error reported by transport layer */
+
+struct qcc {
+	struct connection *conn;
+	uint64_t nb_sc; /* number of attached stream connectors */
+	uint64_t nb_hreq; /* number of in-progress http requests */
+	uint32_t flags; /* QC_CF_* */
+
+	/* flow-control fields set by us and enforced on our side. */
+	struct {
+		struct list frms; /* prepared frames related to flow-control */
+
+		uint64_t ms_bidi_init; /* max initial sub-ID of bidi stream allowed for the peer */
+		uint64_t ms_bidi; /* max sub-ID of bidi stream allowed for the peer */
+		uint64_t cl_bidi_r; /* total count of closed remote bidi streams since last MAX_STREAMS emission */
+
+		uint64_t ms_uni; /* max sub-ID of uni stream allowed for the peer */
+
+		uint64_t msd_bidi_l; /* initial max-stream-data on local bidi streams */
+		uint64_t msd_bidi_r; /* initial max-stream-data on remote bidi streams */
+		uint64_t msd_uni_r; /* initial max-stream-data on remote uni streams */
+
+		uint64_t md; /* current max-data allowed for the peer */
+		uint64_t md_init; /* initial max-data */
+		uint64_t offsets_recv; /* sum of offsets received */
+		uint64_t offsets_consume; /* sum of offsets consumed */
+	} lfctl;
+
+	/* flow-control fields set by the peer which we must respect.
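+	 * As an illustrative note (an invariant, not enforced by the
+	 * structure itself) : emission must never cross the peer's
+	 * connection limit, i.e. at any time
+	 *
+	 *    qcc->tx.offsets <= qcc->rfctl.md
+	 *
+	 * and when this limit is reached, sending is blocked and
+	 * QC_CF_BLK_MFCTL is set.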
*/
+	struct {
+		uint64_t md; /* connection flow control limit updated on MAX_DATA frames reception */
+		uint64_t msd_bidi_l; /* initial max-stream-data from peer on local bidi streams */
+		uint64_t msd_bidi_r; /* initial max-stream-data from peer on remote bidi streams */
+		uint64_t msd_uni_l; /* initial max-stream-data from peer on local uni streams */
+	} rfctl;
+
+	struct {
+		uint64_t offsets; /* sum of all offsets prepared */
+		uint64_t sent_offsets; /* sum of all offsets sent */
+	} tx;
+
+	uint64_t largest_bidi_r; /* largest remote bidi stream ID opened. */
+	uint64_t largest_uni_r; /* largest remote uni stream ID opened. */
+	uint64_t next_bidi_l; /* next stream ID to use for local bidi stream */
+	uint64_t next_uni_l; /* next stream ID to use for local uni stream */
+
+	struct eb_root streams_by_id; /* all active streams by their ID */
+
+	struct list send_retry_list; /* list of qcs eligible to send retry */
+	struct list send_list; /* list of qcs ready to send (STREAM, STOP_SENDING or RESET_STREAM emission) */
+
+	struct wait_event wait_event; /* To be used if we're waiting for I/Os */
+
+	struct proxy *proxy;
+
+	/* haproxy timeout management */
+	struct task *task;
+	struct list opening_list; /* list of not already attached streams (http-request timeout) */
+	int timeout;
+	int shut_timeout;
+	int idle_start; /* base time for http-keep-alive timeout */
+	struct quic_err err; /* code for locally detected error */
+
+	const struct qcc_app_ops *app_ops;
+	void *ctx; /* Application layer context */
+};
+
+#define QC_SF_NONE              0x00000000
+#define QC_SF_SIZE_KNOWN        0x00000001 /* last frame received for this stream */
+#define QC_SF_FIN_STREAM        0x00000002 /* FIN bit must be set for last frame of the stream */
+#define QC_SF_BLK_MROOM         0x00000004 /* app layer is blocked waiting for room in the qcs.tx.buf */
+#define QC_SF_DETACH            0x00000008 /* sc is detached but there is remaining data to send */
+#define QC_SF_BLK_SFCTL         0x00000010 /* stream blocked due to stream flow control limit */
+#define QC_SF_DEM_FULL          0x00000020 /* demux blocked on request channel buffer full */
+#define QC_SF_READ_ABORTED      0x00000040 /* Rx closed using STOP_SENDING */
+#define QC_SF_TO_RESET          0x00000080 /* a RESET_STREAM must be sent */
+#define QC_SF_HREQ_RECV         0x00000100 /* a full HTTP request has been received */
+#define QC_SF_TO_STOP_SENDING   0x00000200 /* a STOP_SENDING must be sent */
+#define QC_SF_UNKNOWN_PL_LENGTH 0x00000400 /* HTX EOM may be missing from the stream layer */
+#define QC_SF_RECV_RESET        0x00000800 /* a RESET_STREAM was received */
+
+/* Maximum size of stream Rx buffer. */
+#define QC_S_RX_BUF_SZ (global.tune.bufsize - NCB_RESERVED_SZ)
+
+/* QUIC stream states
+ *
+ * On initialization a stream is put in the idle state. It is opened as soon
+ * as data has been successfully sent or received on it.
+ *
+ * A bidirectional stream has two channels which can be closed separately. The
+ * local channel is closed when the STREAM frame with FIN or a RESET_STREAM has
+ * been emitted. The remote channel is closed as soon as all data from the peer
+ * has been received. The stream goes instantly to the close state once both
+ * channels are closed.
+ *
+ * A unidirectional stream has only one channel of communication. Thus, it does
+ * not use half-closed states and transitions directly from the open to the
+ * close state.
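+ *
+ * As an illustrative sketch (hypothetical helper, not the actual mux code),
+ * closing the local channel could update the state like this :
+ *
+ *    static void qcs_close_local_sketch(struct qcs *qcs)
+ *    {
+ *            if (quic_stream_is_bidi(qcs->id))
+ *                    qcs->st = (qcs->st == QC_SS_HREM) ? QC_SS_CLO : QC_SS_HLOC;
+ *            else
+ *                    qcs->st = QC_SS_CLO;
+ *    }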
+ */ +enum qcs_state { + QC_SS_IDLE = 0, /* initial state */ + QC_SS_OPEN, /* opened */ + QC_SS_HLOC, /* half-closed local */ + QC_SS_HREM, /* half-closed remote */ + QC_SS_CLO, /* closed */ +} __attribute__((packed)); + +struct qcs { + struct qcc *qcc; + struct sedesc *sd; + uint32_t flags; /* QC_SF_* */ + enum qcs_state st; /* QC_SS_* state */ + void *ctx; /* app-ops context */ + + struct { + uint64_t offset; /* absolute current base offset of ncbuf */ + uint64_t offset_max; /* maximum absolute offset received */ + struct ncbuf ncbuf; /* receive buffer - can handle out-of-order offset frames */ + struct buffer app_buf; /* receive buffer used by stconn layer */ + uint64_t msd; /* current max-stream-data limit to enforce */ + uint64_t msd_init; /* initial max-stream-data */ + } rx; + struct { + uint64_t offset; /* last offset of data ready to be sent */ + uint64_t sent_offset; /* last offset sent by transport layer */ + struct buffer buf; /* transmit buffer before sending via xprt */ + uint64_t msd; /* fctl bytes limit to respect on emission */ + } tx; + + struct eb64_node by_id; + uint64_t id; + struct qc_stream_desc *stream; + + struct list el; /* element of qcc.send_retry_list */ + struct list el_send; /* element of qcc.send_list */ + struct list el_opening; /* element of qcc.opening_list */ + + struct wait_event wait_event; + struct wait_event *subs; + + uint64_t err; /* error code to transmit via RESET_STREAM */ + + int start; /* base timestamp for http-request timeout */ +}; + +/* Used as qcc_app_ops.close callback argument. */ +enum qcc_app_ops_close_side { + QCC_APP_OPS_CLOSE_SIDE_RD, /* Read channel closed (RESET_STREAM received). */ + QCC_APP_OPS_CLOSE_SIDE_WR /* Write channel closed (STOP_SENDING received). */ +}; + +/* QUIC application layer operations */ +struct qcc_app_ops { + int (*init)(struct qcc *qcc); + int (*attach)(struct qcs *qcs, void *conn_ctx); + ssize_t (*decode_qcs)(struct qcs *qcs, struct buffer *b, int fin); + size_t (*snd_buf)(struct qcs *qcs, struct buffer *buf, size_t count); + size_t (*nego_ff)(struct qcs *qcs, size_t count); + size_t (*done_ff)(struct qcs *qcs); + int (*close)(struct qcs *qcs, enum qcc_app_ops_close_side side); + void (*detach)(struct qcs *qcs); + int (*finalize)(void *ctx); + void (*shutdown)(void *ctx); /* Close a connection. 
*/
+	void (*release)(void *ctx);
+	void (*inc_err_cnt)(void *ctx, int err_code);
+};
+
+#endif /* USE_QUIC */
+
+#endif /* _HAPROXY_MUX_QUIC_T_H */ diff --git a/include/haproxy/mux_quic.h b/include/haproxy/mux_quic.h new file mode 100644 index 0000000..872c5ea --- /dev/null +++ b/include/haproxy/mux_quic.h @@ -0,0 +1,116 @@ +#ifndef _HAPROXY_MUX_QUIC_H
+#define _HAPROXY_MUX_QUIC_H
+
+#ifdef USE_QUIC
+#ifndef USE_OPENSSL
+#error "Must define USE_OPENSSL"
+#endif
+
+#include <haproxy/api.h>
+#include <haproxy/connection.h>
+#include <haproxy/list.h>
+#include <haproxy/mux_quic-t.h>
+#include <haproxy/stconn.h>
+
+void qcc_set_error(struct qcc *qcc, int err, int app);
+struct qcs *qcc_init_stream_local(struct qcc *qcc, int bidi);
+struct stconn *qcs_attach_sc(struct qcs *qcs, struct buffer *buf, char fin);
+int qcs_is_close_local(struct qcs *qcs);
+int qcs_is_close_remote(struct qcs *qcs);
+struct buffer *qcs_get_buf(struct qcs *qcs, struct buffer *bptr);
+
+int qcs_subscribe(struct qcs *qcs, int event_type, struct wait_event *es);
+void qcs_notify_recv(struct qcs *qcs);
+void qcs_notify_send(struct qcs *qcs);
+
+void qcc_emit_cc_app(struct qcc *qcc, int err, int immediate);
+void qcc_reset_stream(struct qcs *qcs, int err);
+void qcc_send_stream(struct qcs *qcs, int urg);
+void qcc_abort_stream_read(struct qcs *qcs);
+int qcc_recv(struct qcc *qcc, uint64_t id, uint64_t len, uint64_t offset,
+             char fin, char *data);
+int qcc_recv_max_data(struct qcc *qcc, uint64_t max);
+int qcc_recv_max_stream_data(struct qcc *qcc, uint64_t id, uint64_t max);
+int qcc_recv_reset_stream(struct qcc *qcc, uint64_t id, uint64_t err, uint64_t final_size);
+int qcc_recv_stop_sending(struct qcc *qcc, uint64_t id, uint64_t err);
+void qcc_streams_sent_done(struct qcs *qcs, uint64_t data, uint64_t offset);
+
+/* Bit shift to get the stream sub ID for internal use, which is obtained by
+ * shifting the stream IDs by this value, knowing that the
+ * QCS_ID_TYPE_SHIFT least significant bits identify the stream ID
+ * types (client initiated bidirectional, server initiated bidirectional,
+ * client initiated unidirectional, server initiated unidirectional).
+ * Note that there is no reference to such stream sub IDs in the RFC.
+ */
+#define QCS_ID_TYPE_MASK 0x3
+#define QCS_ID_TYPE_SHIFT 2
+/* The least significant bit of a stream ID is set for a server initiated stream */
+#define QCS_ID_SRV_INTIATOR_BIT 0x1
+/* This bit is set for unidirectional streams */
+#define QCS_ID_DIR_BIT 0x2
+
+static inline enum qcs_type qcs_id_type(uint64_t id)
+{
+	return id & QCS_ID_TYPE_MASK;
+}
+
+/* Return true if stream has been opened locally. */
+static inline int quic_stream_is_local(struct qcc *qcc, uint64_t id)
+{
+	return conn_is_back(qcc->conn) == !(id & QCS_ID_SRV_INTIATOR_BIT);
+}
+
+/* Return true if stream is opened by peer. */
+static inline int quic_stream_is_remote(struct qcc *qcc, uint64_t id)
+{
+	return !quic_stream_is_local(qcc, id);
+}
+
+static inline int quic_stream_is_uni(uint64_t id)
+{
+	return id & QCS_ID_DIR_BIT;
+}
+
+static inline int quic_stream_is_bidi(uint64_t id)
+{
+	return !quic_stream_is_uni(id);
+}
+
+static inline char *qcs_st_to_str(enum qcs_state st)
+{
+	switch (st) {
+	case QC_SS_IDLE: return "IDL";
+	case QC_SS_OPEN: return "OPN";
+	case QC_SS_HLOC: return "HCL";
+	case QC_SS_HREM: return "HCR";
+	case QC_SS_CLO:  return "CLO";
+	default:         return "???";
+	}
+}
+
+int qcc_install_app_ops(struct qcc *qcc, const struct qcc_app_ops *app_ops);
+
+/* Register <qcs> stream for http-request timeout.
If the stream is not
+ * attached within the configured delay, the qcc timeout task will be
+ * triggered. This means the full header section was not received in time.
+ *
+ * This function should be called by the application protocol layer on request
+ * streams initialization.
+ */
+static inline void qcs_wait_http_req(struct qcs *qcs)
+{
+	struct qcc *qcc = qcs->qcc;
+
+	/* A stream cannot be registered several times. */
+	BUG_ON_HOT(tick_isset(qcs->start));
+	qcs->start = now_ms;
+
+	/* qcc.opening_list size is limited by flow-control so no custom
+	 * restriction is needed here.
+	 */
+	LIST_APPEND(&qcc->opening_list, &qcs->el_opening);
+}
+
+#endif /* USE_QUIC */
+
+#endif /* _HAPROXY_MUX_QUIC_H */ diff --git a/include/haproxy/mworker-t.h b/include/haproxy/mworker-t.h new file mode 100644 index 0000000..3137ec0 --- /dev/null +++ b/include/haproxy/mworker-t.h @@ -0,0 +1,51 @@ +/*
+ * include/haproxy/mworker-t.h
+ * Master Worker type definitions.
+ *
+ * Copyright HAProxy Technologies 2019 - William Lallemand <wlallemand@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef _HAPROXY_MWORKER_T_H_
+#define _HAPROXY_MWORKER_T_H_
+
+#include <haproxy/list.h>
+#include <haproxy/signal-t.h>
+
+/* options for mworker_proc */
+
+#define PROC_O_TYPE_MASTER  0x00000001
+#define PROC_O_TYPE_WORKER  0x00000002
+#define PROC_O_TYPE_PROG    0x00000004
+/* 0x00000008 unused */
+#define PROC_O_LEAVING      0x00000010 /* this process should be leaving */
+/* 0x00000020 to 0x00000080 unused */
+#define PROC_O_START_RELOAD 0x00000100 /* Start the process even if the master was re-executed */
+
+/*
+ * Structure used to describe the processes in master worker mode
+ */
+struct server;
+struct mworker_proc {
+	int pid;
+	int options;
+	char *id;
+	char **command;
+	char *path;
+	char *version;
+	int ipc_fd[2]; /* 0 is master side, 1 is worker side */
+	int reloads;
+	int failedreloads; /* number of failed reloads since the last successful one */
+	int timestamp;
+	struct server *srv; /* the server entry in the master proxy */
+	struct list list;
+	int uid;
+	int gid;
+};
+
+#endif /* _HAPROXY_MWORKER_T_H_ */ diff --git a/include/haproxy/mworker.h b/include/haproxy/mworker.h new file mode 100644 index 0000000..c9dd840 --- /dev/null +++ b/include/haproxy/mworker.h @@ -0,0 +1,48 @@ +/*
+ * include/haproxy/mworker.h
+ * Master Worker function prototypes.
+ *
+ * Copyright HAProxy Technologies 2019 - William Lallemand <wlallemand@haproxy.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ * + */ + +#ifndef _HAPROXY_MWORKER_H_ +#define _HAPROXY_MWORKER_H_ + +#include <haproxy/mworker-t.h> +#include <haproxy/signal-t.h> + +extern struct mworker_proc *proc_self; + +void mworker_proc_list_to_env(void); +int mworker_env_to_proc_list(void); + + +void mworker_block_signals(void); +void mworker_unblock_signals(void); + +void mworker_broadcast_signal(struct sig_handler *sh); +void mworker_catch_sighup(struct sig_handler *sh); +void mworker_catch_sigterm(struct sig_handler *sh); +void mworker_catch_sigchld(struct sig_handler *sh); + +void mworker_accept_wrapper(int fd); + +void mworker_cleanlisteners(void); + +int mworker_child_nb(void); + +int mworker_ext_launch_all(void); + +void mworker_kill_max_reloads(int sig); + +struct mworker_proc *mworker_proc_new(); +void mworker_free_child(struct mworker_proc *); +void mworker_cleanup_proc(); + +#endif /* _HAPROXY_MWORKER_H_ */ diff --git a/include/haproxy/namespace-t.h b/include/haproxy/namespace-t.h new file mode 100644 index 0000000..fe46577 --- /dev/null +++ b/include/haproxy/namespace-t.h @@ -0,0 +1,39 @@ +/* + * include/haproxy/namespace-t.h + * Linux network namespaces types definitions + * + * Copyright (C) 2014 Tamas Kovacs, Sarkozi Laszlo, Krisztian Kovacs + * Copyright (C) 2015-2020 Willy Tarreau + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_NAMESPACE_T_H +#define _HAPROXY_NAMESPACE_T_H + +#include <import/ebtree-t.h> +#include <haproxy/api-t.h> + +/* the struct is just empty if namespaces are not supported */ +struct netns_entry +{ +#ifdef USE_NS + struct ebpt_node node; + size_t name_len; + int fd; +#endif +}; + +#endif /* _HAPROXY_NAMESPACE_T_H */ diff --git a/include/haproxy/namespace.h b/include/haproxy/namespace.h new file mode 100644 index 0000000..2d6b6f8 --- /dev/null +++ b/include/haproxy/namespace.h @@ -0,0 +1,47 @@ +/* + * include/haproxy/namespace.h + * Linux network namespaces management + * + * Copyright (C) 2014 Tamas Kovacs, Sarkozi Laszlo, Krisztian Kovacs + * Copyright (C) 2015-2020 Willy Tarreau + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_NAMESPACE_H
+#define _HAPROXY_NAMESPACE_H
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <import/ebistree.h>
+#include <haproxy/namespace-t.h>
+
+#ifdef USE_NS
+
+int my_socketat(const struct netns_entry *ns, int domain, int type, int protocol);
+struct netns_entry* netns_store_insert(const char *ns_name);
+const struct netns_entry* netns_store_lookup(const char *ns_name, size_t ns_name_len);
+int netns_init(void);
+
+#else /* no namespace support */
+
+static inline int my_socketat(const struct netns_entry *ns, int domain, int type, int protocol)
+{
+	return socket(domain, type, protocol);
+}
+
+#endif /* USE_NS */
+
+#endif /* _HAPROXY_NAMESPACE_H */ diff --git a/include/haproxy/ncbuf-t.h b/include/haproxy/ncbuf-t.h new file mode 100644 index 0000000..0dd958f --- /dev/null +++ b/include/haproxy/ncbuf-t.h @@ -0,0 +1,104 @@ +#ifndef _HAPROXY_NCBUF_T_H
+#define _HAPROXY_NCBUF_T_H
+
+/* **** public documentation ****
+ *
+ * <ncbuf> stands for non-contiguous circular buffer. This type can be used to
+ * store data in a non-linear way with gaps between them. The buffer is
+ * circular and so data may wrap.
+ *
+ * The API of <ncbuf> is split in two parts. Please refer to the public API
+ * declared in this header file which should cover all the needs.
+ *
+ * To minimize the memory footprint, the sizes of data and gaps are stored in
+ * the gaps themselves. This way <ncbuf> does not need to maintain a separate
+ * list of data offsets in a dedicated structure. However, this puts some
+ * limitations on the buffer usage that the user needs to know.
+ *
+ * First, a space will always be reserved in the allocated buffer area to store
+ * the size of the first data block. Use ncb_size(buf) to retrieve the usable
+ * size of the allocated buffer excluding the reserved space.
+ *
+ * Second, add and delete operations are constrained and may be impossible if
+ * a minimal gap size between data is not respected. A caller must always
+ * inspect the return values of these functions. To limit these errors and
+ * improve the buffer performance, <ncbuf> should be reserved for use-cases
+ * where the number of formed gaps is kept minimal and evenly spread.
+ */
+
+/* **** internal documentation ****
+ *
+ * This section is useful to users who need to understand how ncbuf is
+ * implemented.
+ *
+ * Public and internal functions all share a common abstraction of the buffer.
+ * The buffer content is represented as a list of blocks, alternating between
+ * DATA and GAP blocks. This simplifies the buffer examination loop and
+ * insertion/deletion. Note that this list of blocks is not stored in the
+ * buffer structure.
+ *
+ * The buffer is considered to always start with a DATA block. The size of this
+ * block is stored just before <head> which is the pointer for offset 0. This
+ * space will always be reserved for this usage. It can be accessed through
+ * ncb_int_head(buf). If the buffer has no data at head, the reserved space
+ * will simply contain the value 0, and will be followed by a gap.
+ *
+ * A gap always contains the size of the gap itself and the size of the next
+ * data block. Here is a small representation of a gap stored at offset <x>
+ * before a data block at offset <y>.
+ *
+ *        x                                y
+ * ------------------------------------------------------------
+ * xxxxxx| GAP-SZ | DATA-SZ |              | xxxxxxxxxxxxx...
+ * ------------------------------------------------------------
+ *       | -------- GAP-SZ -------------- > | --- DATA-SZ --->
+ *
+ * This means that a gap must be at least big enough to store two sizes.
+ * However, there is an optimization when the last block of the buffer is a
+ * gap. In this case, there is no minimal size for this block. If the gap is
+ * too small, the two sizes won't be stored in it. This block is considered
+ * to be a reduced gap. The block API will detect such a gap if stored at an
+ * offset near the end of the buffer.
+ *
+ */
+
+#include <inttypes.h>
+
+/* ncb_sz_t is the basic type used in ncbuf to represent data and gap sizes.
+ * Use a bigger type to extend the maximum data size supported in the buffer.
+ * On the other hand, this also increases the minimal gap size which can
+ * cause more rejections for add/delete operations.
+ */
+typedef uint32_t ncb_sz_t;
+
+/* reserved size before head used to store first data block size */
+#define NCB_RESERVED_SZ (sizeof(ncb_sz_t))
+
+/* A gap contains its size and the size of the data following it. */
+#define NCB_GAP_MIN_SZ (sizeof(ncb_sz_t) * 2)
+#define NCB_GAP_SZ_OFF 0
+#define NCB_GAP_SZ_DATA_OFF (sizeof(ncb_sz_t))
+
+#define NCBUF_NULL ((struct ncbuf){ })
+
+struct ncbuf {
+	char *area;
+	ncb_sz_t size;
+	ncb_sz_t head;
+};
+
+enum ncb_ret {
+	NCB_RET_OK = 0, /* no error */
+
+	NCB_RET_GAP_SIZE, /* operation would create a too small gap */
+	NCB_RET_DATA_REJ, /* operation would overwrite data with different one */
+};
+
+/* Define how an insert is conducted with regard to already stored data. */
+enum ncb_add_mode {
+	NCB_ADD_PRESERVE, /* keep the already stored data and only insert in gaps */
+	NCB_ADD_OVERWRT,  /* overwrite old data with new ones */
+	NCB_ADD_COMPARE,  /* compare before insert : if new data are different, do not proceed */
+};
+
+#endif /* _HAPROXY_NCBUF_T_H */ diff --git a/include/haproxy/ncbuf.h b/include/haproxy/ncbuf.h new file mode 100644 index 0000000..8972793 --- /dev/null +++ b/include/haproxy/ncbuf.h @@ -0,0 +1,54 @@ +#ifndef _HAPROXY_NCBUF_H
+#define _HAPROXY_NCBUF_H
+
+#include <haproxy/ncbuf-t.h>
+
+static inline int ncb_is_null(const struct ncbuf *buf)
+{
+	return buf->size == 0;
+}
+
+void ncb_init(struct ncbuf *buf, ncb_sz_t head);
+struct ncbuf ncb_make(char *area, ncb_sz_t size, ncb_sz_t head);
+
+/* Returns start of allocated buffer area. */
+static inline char *ncb_orig(const struct ncbuf *buf)
+{
+	return buf->area;
+}
+
+/* Returns current head pointer into buffer area. */
+static inline char *ncb_head(const struct ncbuf *buf)
+{
+	return buf->area + buf->head;
+}
+
+/* Returns the first byte after the allocated buffer area. */
+static inline char *ncb_wrap(const struct ncbuf *buf)
+{
+	return buf->area + buf->size;
+}
+
+/* Returns the usable size of <buf> for data storage. This is the size of the
+ * allocated buffer without the reserved header space.
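+ *
+ * Illustrative usage of the API declared below (a sketch; <data> and
+ * <data_len> are caller-provided, error handling elided) :
+ *
+ *    char area[1024];
+ *    struct ncbuf b = ncb_make(area, sizeof(area), 0);
+ *
+ *    ncb_init(&b, 0);
+ *    if (ncb_add(&b, 10, data, data_len, NCB_ADD_PRESERVE) == NCB_RET_OK)
+ *            ... out-of-order data now stored at offset 10 ...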
+ */ +static inline ncb_sz_t ncb_size(const struct ncbuf *buf) +{ + if (ncb_is_null(buf)) + return 0; + + return buf->size - NCB_RESERVED_SZ; +} + +ncb_sz_t ncb_total_data(const struct ncbuf *buf); +int ncb_is_empty(const struct ncbuf *buf); +int ncb_is_full(const struct ncbuf *buf); +int ncb_is_fragmented(const struct ncbuf *buf); + +ncb_sz_t ncb_data(const struct ncbuf *buf, ncb_sz_t offset); + +enum ncb_ret ncb_add(struct ncbuf *buf, ncb_sz_t off, + const char *data, ncb_sz_t len, enum ncb_add_mode mode); +enum ncb_ret ncb_advance(struct ncbuf *buf, ncb_sz_t adv); + +#endif /* _HAPROXY_NCBUF_H */ diff --git a/include/haproxy/net_helper.h b/include/haproxy/net_helper.h new file mode 100644 index 0000000..f019d30 --- /dev/null +++ b/include/haproxy/net_helper.h @@ -0,0 +1,387 @@ +/* + * include/haproxy/net_helper.h + * This file contains miscellaneous network helper functions. + * + * Copyright (C) 2017 Olivier Houchard + * Copyright (C) 2017-2020 Willy Tarreau + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+#ifndef _HAPROXY_NET_HELPER_H
+#define _HAPROXY_NET_HELPER_H
+
+#include <arpa/inet.h>
+#include <haproxy/api.h>
+#include <haproxy/intops.h>
+
+/* Functions to read/write various integers that may be unaligned */
+
+/* Read a uint16_t in native host order */
+static inline uint16_t read_u16(const void *p)
+{
+	const union { uint16_t u16; } __attribute__((packed)) *u = p;
+	return u->u16;
+}
+
+/* Write a uint16_t in native host order */
+static inline void write_u16(void *p, const uint16_t u16)
+{
+	union { uint16_t u16; } __attribute__((packed)) *u = p;
+	u->u16 = u16;
+}
+
+/* Read a uint32_t in native host order */
+static inline uint32_t read_u32(const void *p)
+{
+	const union { uint32_t u32; } __attribute__((packed)) *u = p;
+	return u->u32;
+}
+
+/* Write a uint32_t in native host order */
+static inline void write_u32(void *p, const uint32_t u32)
+{
+	union { uint32_t u32; } __attribute__((packed)) *u = p;
+	u->u32 = u32;
+}
+
+/* Read a uint64_t in native host order */
+static inline uint64_t read_u64(const void *p)
+{
+	const union { uint64_t u64; } __attribute__((packed)) *u = p;
+	return u->u64;
+}
+
+/* Write a uint64_t in native host order */
+static inline void write_u64(void *p, const uint64_t u64)
+{
+	union { uint64_t u64; } __attribute__((packed)) *u = p;
+	u->u64 = u64;
+}
+
+/* Read a void* in native host order */
+static inline void *read_ptr(const void *p)
+{
+	const union { void *ptr; } __attribute__((packed)) *u = p;
+	return u->ptr;
+}
+
+/* Write a void* in native host order */
+static inline void write_ptr(void *p, const void *ptr)
+{
+	if (sizeof(ptr) == 4)
+		return write_u32(p, (uintptr_t)ptr);
+	else
+		return write_u64(p, (uintptr_t)ptr);
+}
+
+/* Read a possibly wrapping number of bytes <bytes> into destination <dst>. The
+ * first segment is composed of <s1> bytes at p1. The remaining byte(s), if any,
+ * are read from <p2>. <s1> may be zero and may also be larger than <bytes>. The
+ * caller is always responsible for providing enough bytes. Note: the function
+ * is purposely *not* marked inline to let the compiler decide what to do with
+ * it, because it's around 34 bytes long, placed on the critical path but rarely
+ * called, and uses a lot of arguments if not inlined. The compiler will thus
+ * decide what's best to do with it depending on the context.
+ */
+static void readv_bytes(void *dst, const size_t bytes, const void *p1, size_t s1, const void *p2)
+{
+	size_t idx;
+
+	p2 -= s1;
+	for (idx = 0; idx < bytes; idx++) {
+		if (idx == s1)
+			p1 = p2;
+		((uint8_t *)dst)[idx] = ((const uint8_t *)p1)[idx];
+	}
+	/* this memory barrier is critical otherwise gcc may over-optimize this
+	 * code, completely removing it as well as any surrounding boundary
+	 * check (4.7.1..6.4.0)!
+	 */
+	__asm__ volatile("" ::: "memory");
+}
+
+/* Write a possibly wrapping number of bytes <bytes> from location <src>. The
+ * first segment is composed of <s1> bytes at p1. The remaining byte(s), if any,
+ * are written to <p2>. <s1> may be zero and may also be larger than <bytes>.
+ * The caller is always responsible for providing enough room. Note: the
+ * function is purposely *not* marked inline to let the compiler decide what to
+ * do with it, because it's around 34 bytes long, placed on the critical path
+ * but rarely called, and uses a lot of arguments if not inlined. The compiler
+ * will thus decide what's best to do with it depending on the context.
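+ *
+ * Illustrative use with a circular buffer <area> of size <sz> : writing a
+ * value at offset <off> which may wrap (a sketch, assuming off < sz) :
+ *
+ *    writev_bytes(&val, sizeof(val), area + off, sz - off, area);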
+ */ +static void writev_bytes(const void *src, const size_t bytes, void *p1, size_t s1, void *p2) +{ + size_t idx; + + p2 -= s1; + for (idx = 0; idx < bytes; idx++) { + if (idx == s1) + p1 = p2; + ((uint8_t *)p1)[idx] = ((const uint8_t *)src)[idx]; + } +} + +/* Read a possibly wrapping uint16_t in native host order. The first segment is + * composed of <s1> bytes at p1. The remaining byte(s), if any, are read from + * <p2>. <s1> may be zero and may be larger than the type. The caller is always + * responsible for providing enough bytes. + */ +static inline uint16_t readv_u16(const void *p1, size_t s1, const void *p2) +{ + if (unlikely(s1 == 1)) { + volatile uint16_t u16; + + ((uint8_t *)&u16)[0] = *(uint8_t *)p1; + ((uint8_t *)&u16)[1] = *(uint8_t *)p2; + return u16; + } + else { + const union { uint16_t u16; } __attribute__((packed)) *u; + + u = (s1 == 0) ? p2 : p1; + return u->u16; + } +} + +/* Write a possibly wrapping uint16_t in native host order. The first segment is + * composed of <s1> bytes at p1. The remaining byte(s), if any, are written to + * <p2>. <s1> may be zero and may be larger than the type. The caller is always + * responsible for providing enough room. + */ +static inline void writev_u16(void *p1, size_t s1, void *p2, const uint16_t u16) +{ + union { uint16_t u16; } __attribute__((packed)) *u; + + if (unlikely(s1 == 1)) { + *(uint8_t *)p1 = ((const uint8_t *)&u16)[0]; + *(uint8_t *)p2 = ((const uint8_t *)&u16)[1]; + } + else { + u = (s1 == 0) ? p2 : p1; + u->u16 = u16; + } +} + +/* Read a possibly wrapping uint32_t in native host order. The first segment is + * composed of <s1> bytes at p1. The remaining byte(s), if any, are read from + * <p2>. <s1> may be zero and may be larger than the type. The caller is always + * responsible for providing enough bytes. + */ +static inline uint32_t readv_u32(const void *p1, size_t s1, const void *p2) +{ + uint32_t u32; + + if (likely(s1 >= sizeof(u32))) + u32 = read_u32(p1); + else + readv_bytes(&u32, sizeof(u32), p1, s1, p2); + return u32; +} + +/* Write a possibly wrapping uint32_t in native host order. The first segment is + * composed of <s1> bytes at p1. The remaining byte(s), if any, are written to + * <p2>. <s1> may be zero and may be larger than the type. The caller is always + * responsible for providing enough room. + */ +static inline void writev_u32(void *p1, size_t s1, void *p2, const uint32_t u32) +{ + if (likely(s1 >= sizeof(u32))) + write_u32(p1, u32); + else + writev_bytes(&u32, sizeof(u32), p1, s1, p2); +} + +/* Read a possibly wrapping uint64_t in native host order. The first segment is + * composed of <s1> bytes at p1. The remaining byte(s), if any, are read from + * <p2>. <s1> may be zero and may be larger than the type. The caller is always + * responsible for providing enough bytes. + */ +static inline uint64_t readv_u64(const void *p1, size_t s1, const void *p2) +{ + uint64_t u64; + + if (likely(s1 >= sizeof(u64))) + u64 = read_u64(p1); + else + readv_bytes(&u64, sizeof(u64), p1, s1, p2); + return u64; +} + +/* Write a possibly wrapping uint64_t in native host order. The first segment is + * composed of <s1> bytes at p1. The remaining byte(s), if any, are written to + * <p2>. <s1> may be zero and may be larger than the type. The caller is always + * responsible for providing enough room. 
+ */
+static inline void writev_u64(void *p1, size_t s1, void *p2, const uint64_t u64)
+{
+	if (likely(s1 >= sizeof(u64)))
+		write_u64(p1, u64);
+	else
+		writev_bytes(&u64, sizeof(u64), p1, s1, p2);
+}
+
+/* Signed integer versions : return the same data but signed */
+
+/* Read an int16_t in native host order */
+static inline int16_t read_i16(const void *p)
+{
+	return read_u16(p);
+}
+
+/* Read an int32_t in native host order */
+static inline int32_t read_i32(const void *p)
+{
+	return read_u32(p);
+}
+
+/* Read an int64_t in native host order */
+static inline int64_t read_i64(const void *p)
+{
+	return read_u64(p);
+}
+
+/* Read a possibly wrapping int16_t in native host order */
+static inline int16_t readv_i16(const void *p1, size_t s1, const void *p2)
+{
+	return readv_u16(p1, s1, p2);
+}
+
+/* Read a possibly wrapping int32_t in native host order */
+static inline int32_t readv_i32(const void *p1, size_t s1, const void *p2)
+{
+	return readv_u32(p1, s1, p2);
+}
+
+/* Read a possibly wrapping int64_t in native host order */
+static inline int64_t readv_i64(const void *p1, size_t s1, const void *p2)
+{
+	return readv_u64(p1, s1, p2);
+}
+
+/* Read a uint16_t, and convert from network order to host order */
+static inline uint16_t read_n16(const void *p)
+{
+	return ntohs(read_u16(p));
+}
+
+/* Write a uint16_t after converting it from host order to network order */
+static inline void write_n16(void *p, const uint16_t u16)
+{
+	write_u16(p, htons(u16));
+}
+
+/* Read a uint32_t, and convert from network order to host order */
+static inline uint32_t read_n32(const void *p)
+{
+	return ntohl(read_u32(p));
+}
+
+/* Write a uint32_t after converting it from host order to network order */
+static inline void write_n32(void *p, const uint32_t u32)
+{
+	write_u32(p, htonl(u32));
+}
+
+/* Read a uint64_t, and convert from network order to host order */
+static inline uint64_t read_n64(const void *p)
+{
+	return my_ntohll(read_u64(p));
+}
+
+/* Write a uint64_t after converting it from host order to network order */
+static inline void write_n64(void *p, const uint64_t u64)
+{
+	write_u64(p, my_htonll(u64));
+}
+
+/* Read a possibly wrapping uint16_t in network order. The first segment is
+ * composed of <s1> bytes at p1. The remaining byte(s), if any, are read from
+ * <p2>. <s1> may be zero and may be larger than the type. The caller is always
+ * responsible for providing enough bytes.
+ */
+static inline uint16_t readv_n16(const void *p1, size_t s1, const void *p2)
+{
+	if (unlikely(s1 < 2)) {
+		if (s1 == 0)
+			p1 = p2++;
+	}
+	else
+		p2 = p1 + 1;
+	return (*(uint8_t *)p1 << 8) + *(uint8_t *)p2;
+}
+
+/* Write a possibly wrapping uint16_t in network order. The first segment is
+ * composed of <s1> bytes at p1. The remaining byte(s), if any, are written to
+ * <p2>. <s1> may be zero and may be larger than the type. The caller is always
+ * responsible for providing enough room. Note: the destination pointers are
+ * not const since the bytes are written through them.
+ */
+static inline void writev_n16(void *p1, size_t s1, void *p2, const uint16_t u16)
+{
+	if (unlikely(s1 < 2)) {
+		if (s1 == 0)
+			p1 = p2++;
+	}
+	else
+		p2 = p1 + 1;
+	*(uint8_t *)p1 = u16 >> 8;
+	*(uint8_t *)p2 = u16;
+}
+
+/* Read a possibly wrapping uint32_t in network order. The first segment is
+ * composed of <s1> bytes at p1. The remaining byte(s), if any, are read from
+ * <p2>. <s1> may be zero and may be larger than the type. The caller is always
+ * responsible for providing enough bytes.
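+ *
+ * For example (illustrative), parsing a 4-byte big-endian length field which
+ * may wrap at the end of a ring buffer <area> of size <sz>, at offset <off> :
+ *
+ *    uint32_t len = readv_n32(area + off, sz - off, area);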
+ */
+static inline uint32_t readv_n32(const void *p1, size_t s1, const void *p2)
+{
+	return ntohl(readv_u32(p1, s1, p2));
+}
+
+/* Write a possibly wrapping uint32_t in network order. The first segment is
+ * composed of <s1> bytes at p1. The remaining byte(s), if any, are written to
+ * <p2>. <s1> may be zero and may be larger than the type. The caller is always
+ * responsible for providing enough room.
+ */
+static inline void writev_n32(void *p1, size_t s1, void *p2, const uint32_t u32)
+{
+	writev_u32(p1, s1, p2, htonl(u32));
+}
+
+/* Read a possibly wrapping uint64_t in network order. The first segment is
+ * composed of <s1> bytes at p1. The remaining byte(s), if any, are read from
+ * <p2>. <s1> may be zero and may be larger than the type. The caller is always
+ * responsible for providing enough bytes.
+ */
+static inline uint64_t readv_n64(const void *p1, size_t s1, const void *p2)
+{
+	return my_ntohll(readv_u64(p1, s1, p2));
+}
+
+/* Write a possibly wrapping uint64_t in network order. The first segment is
+ * composed of <s1> bytes at p1. The remaining byte(s), if any, are written to
+ * <p2>. <s1> may be zero and may be larger than the type. The caller is always
+ * responsible for providing enough room.
+ */
+static inline void writev_n64(void *p1, size_t s1, void *p2, const uint64_t u64)
+{
+	writev_u64(p1, s1, p2, my_htonll(u64));
+}
+
+#endif /* _HAPROXY_NET_HELPER_H */ diff --git a/include/haproxy/obj_type-t.h b/include/haproxy/obj_type-t.h new file mode 100644 index 0000000..517d230 --- /dev/null +++ b/include/haproxy/obj_type-t.h @@ -0,0 +1,56 @@ +/*
+ * include/haproxy/obj_type-t.h
+ * This file declares some object types for use in various structures.
+ *
+ * Copyright (C) 2000-2013 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_OBJ_TYPE_T_H
+#define _HAPROXY_OBJ_TYPE_T_H
+
+/* The principle is to be able to change the type of a pointer by pointing
+ * it directly to an object type. The object type indicates the format of the
+ * structure holding the type, and this is used to retrieve the pointer to the
+ * beginning of the structure. Doing so saves us from having to maintain both
+ * a pointer and a type for elements such as connections which can point to
+ * various types of objects.
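+ *
+ * As an illustration (a sketch), a structure simply embeds its type marker :
+ *
+ *    struct connection {
+ *            enum obj_type obj_type;    (set to OBJ_TYPE_CONN)
+ *            ...
+ *    };
+ *
+ * so a pointer to the <obj_type> field alone is enough to recover the
+ * enclosing structure with container_of(), as the objt_* helpers do.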
+ */ + +/* object types : these ones take the same space as a char */ +enum obj_type { + OBJ_TYPE_NONE = 0, /* pointer is NULL by definition */ + OBJ_TYPE_LISTENER, /* object is a struct listener */ + OBJ_TYPE_PROXY, /* object is a struct proxy */ + OBJ_TYPE_SERVER, /* object is a struct server */ + OBJ_TYPE_APPLET, /* object is a struct applet */ + OBJ_TYPE_APPCTX, /* object is a struct appctx */ + OBJ_TYPE_CONN, /* object is a struct connection */ + OBJ_TYPE_SRVRQ, /* object is a struct dns_srvrq */ + OBJ_TYPE_SC, /* object is a struct stconn */ + OBJ_TYPE_STREAM, /* object is a struct stream */ + OBJ_TYPE_CHECK, /* object is a struct check */ + OBJ_TYPE_ENTRIES /* last one : number of entries */ +} __attribute__((packed)) ; + +#endif /* _HAPROXY_OBJ_TYPE_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/obj_type.h b/include/haproxy/obj_type.h new file mode 100644 index 0000000..1037460 --- /dev/null +++ b/include/haproxy/obj_type.h @@ -0,0 +1,213 @@ +/* + * include/haproxy/obj_type.h + * This file contains function prototypes to manipulate object types + * + * Copyright (C) 2000-2013 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_OBJ_TYPE_H +#define _HAPROXY_OBJ_TYPE_H + +#include <haproxy/api.h> +#include <haproxy/applet-t.h> +#include <haproxy/check-t.h> +#include <haproxy/connection-t.h> +#include <haproxy/listener-t.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/pool.h> +#include <haproxy/proxy-t.h> +#include <haproxy/server-t.h> +#include <haproxy/stream-t.h> + +static inline enum obj_type obj_type(const enum obj_type *t) +{ + if (!t || *t >= OBJ_TYPE_ENTRIES) + return OBJ_TYPE_NONE; + return *t; +} + +static inline const char *obj_type_name(const enum obj_type *t) +{ + switch (obj_type(t)) { + case OBJ_TYPE_NONE: return "NONE"; + case OBJ_TYPE_LISTENER: return "LISTENER"; + case OBJ_TYPE_PROXY: return "PROXY"; + case OBJ_TYPE_SERVER: return "SERVER"; + case OBJ_TYPE_APPLET: return "APPLET"; + case OBJ_TYPE_APPCTX: return "APPCTX"; + case OBJ_TYPE_CONN: return "CONN"; + case OBJ_TYPE_SRVRQ: return "SRVRQ"; + case OBJ_TYPE_SC: return "SC"; + case OBJ_TYPE_STREAM: return "STREAM"; + case OBJ_TYPE_CHECK: return "CHECK"; + default: return "!INVAL!"; + } +} + +/* Note: for convenience, we provide two versions of each function : + * - __objt_<type> : converts the pointer without any control of its + * value nor type. + * - objt_<type> : same as above except that if the pointer is NULL + * or points to a non-matching type, NULL is returned instead. 
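+ *
+ * Typical illustrative use, assuming <t> may point to any object type :
+ *
+ *    struct server *srv = objt_server(t);
+ *
+ *    if (srv)
+ *            ... <t> really was a server's obj_type field ...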
+ */ + +static inline struct listener *__objt_listener(enum obj_type *t) +{ + return container_of(t, struct listener, obj_type); +} + +static inline struct listener *objt_listener(enum obj_type *t) +{ + if (!t || *t != OBJ_TYPE_LISTENER) + return NULL; + return __objt_listener(t); +} + +static inline struct proxy *__objt_proxy(enum obj_type *t) +{ + return container_of(t, struct proxy, obj_type); +} + +static inline struct proxy *objt_proxy(enum obj_type *t) +{ + if (!t || *t != OBJ_TYPE_PROXY) + return NULL; + return __objt_proxy(t); +} + +static inline struct server *__objt_server(enum obj_type *t) +{ + return container_of(t, struct server, obj_type); +} + +static inline struct server *objt_server(enum obj_type *t) +{ + if (!t || *t != OBJ_TYPE_SERVER) + return NULL; + return __objt_server(t); +} + +static inline struct applet *__objt_applet(enum obj_type *t) +{ + return container_of(t, struct applet, obj_type); +} + +static inline struct applet *objt_applet(enum obj_type *t) +{ + if (!t || *t != OBJ_TYPE_APPLET) + return NULL; + return __objt_applet(t); +} + +static inline struct appctx *__objt_appctx(enum obj_type *t) +{ + return container_of(t, struct appctx, obj_type); +} + +static inline struct appctx *objt_appctx(enum obj_type *t) +{ + if (!t || *t != OBJ_TYPE_APPCTX) + return NULL; + return __objt_appctx(t); +} + +static inline struct stconn *__objt_sc(enum obj_type *t) +{ + return (container_of(t, struct stconn, obj_type)); +} + +static inline struct stconn *objt_sc(enum obj_type *t) +{ + if (!t || *t != OBJ_TYPE_SC) + return NULL; + return __objt_sc(t); +} + +static inline struct connection *__objt_conn(enum obj_type *t) +{ + return container_of(t, struct connection, obj_type); +} + +static inline struct connection *objt_conn(enum obj_type *t) +{ + if (!t || *t != OBJ_TYPE_CONN) + return NULL; + return __objt_conn(t); +} + +static inline struct resolv_srvrq *__objt_resolv_srvrq(enum obj_type *t) +{ + return container_of(t, struct resolv_srvrq, obj_type); +} + +static inline struct resolv_srvrq *objt_resolv_srvrq(enum obj_type *t) +{ + if (!t || *t != OBJ_TYPE_SRVRQ) + return NULL; + return __objt_resolv_srvrq(t); +} + +static inline struct stream *__objt_stream(enum obj_type *t) +{ + return container_of(t, struct stream, obj_type); +} + +static inline struct stream *objt_stream(enum obj_type *t) +{ + if (!t || *t != OBJ_TYPE_STREAM) + return NULL; + return __objt_stream(t); +} + +static inline struct check *__objt_check(enum obj_type *t) +{ + return container_of(t, struct check, obj_type); +} + +static inline struct check *objt_check(enum obj_type *t) +{ + if (!t || *t != OBJ_TYPE_CHECK) + return NULL; + return __objt_check(t); +} + +static inline void *obj_base_ptr(enum obj_type *t) +{ + switch (obj_type(t)) { + case OBJ_TYPE_NONE: return NULL; + case OBJ_TYPE_LISTENER: return __objt_listener(t); + case OBJ_TYPE_PROXY: return __objt_proxy(t); + case OBJ_TYPE_SERVER: return __objt_server(t); + case OBJ_TYPE_APPLET: return __objt_applet(t); + case OBJ_TYPE_APPCTX: return __objt_appctx(t); + case OBJ_TYPE_CONN: return __objt_conn(t); + case OBJ_TYPE_SRVRQ: return __objt_resolv_srvrq(t); + case OBJ_TYPE_SC: return __objt_sc(t); + case OBJ_TYPE_STREAM: return __objt_stream(t); + case OBJ_TYPE_CHECK: return __objt_check(t); + default: return t; // exact pointer for invalid case + } +} + +#endif /* _HAPROXY_OBJ_TYPE_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/openssl-compat.h b/include/haproxy/openssl-compat.h 
new file mode 100644 index 0000000..5639468 --- /dev/null +++ b/include/haproxy/openssl-compat.h @@ -0,0 +1,487 @@ +#ifndef _HAPROXY_OPENSSL_COMPAT_H +#define _HAPROXY_OPENSSL_COMPAT_H +#ifdef USE_OPENSSL + +#ifdef USE_OPENSSL_WOLFSSL +#define TLSEXT_MAXLEN_host_name 255 +#include <wolfssl/options.h> +#endif + +#ifdef USE_OPENSSL_AWSLC +#include <openssl/base.h> +#if !defined(OPENSSL_IS_AWSLC) +#error "USE_OPENSSL_AWSLC is set but OPENSSL_IS_AWSLC is not defined, wrong header files detected" +#endif +#endif + +#include <openssl/bn.h> +#include <openssl/crypto.h> +#include <openssl/ssl.h> +#include <openssl/x509.h> +#include <openssl/x509v3.h> +#include <openssl/err.h> +#include <openssl/rand.h> +#include <openssl/hmac.h> +#include <openssl/rsa.h> +#if (defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) +#include <openssl/ocsp.h> +#endif +#ifndef OPENSSL_NO_DH +#include <openssl/dh.h> +#endif +#if defined(USE_ENGINE) && !defined(OPENSSL_NO_ENGINE) +#include <openssl/engine.h> +#endif + +#ifdef SSL_MODE_ASYNC +#include <openssl/async.h> +#endif + +#if (OPENSSL_VERSION_NUMBER >= 0x3000000fL) +#include <openssl/core_names.h> +#include <openssl/decoder.h> +#include <openssl/param_build.h> +#include <openssl/provider.h> +#endif + +#ifdef USE_QUIC_OPENSSL_COMPAT +#include <haproxy/quic_openssl_compat.h> +#endif + +#if defined(LIBRESSL_VERSION_NUMBER) +/* LibreSSL is a fork of OpenSSL 1.0.1g but pretends to be 2.0.0, thus + * systematically breaking when some code is written for a specific version + * of OpenSSL. Let's make it appear like what it really is and deal with + * extra features with ORs and not with AND NOT. + */ +#define HA_OPENSSL_VERSION_NUMBER 0x1000107fL +#else /* this is for a real OpenSSL or a truly compatible derivative */ +#define HA_OPENSSL_VERSION_NUMBER OPENSSL_VERSION_NUMBER +#endif + +#ifndef OPENSSL_VERSION +#define OPENSSL_VERSION SSLEAY_VERSION +#define OpenSSL_version(x) SSLeay_version(x) +#define OpenSSL_version_num SSLeay +#endif + +#if (defined(LIBRESSL_VERSION_NUMBER) && LIBRESSL_VERSION_NUMBER >= 0x2070100fL) || defined(OPENSSL_IS_BORINGSSL) || (!defined(LIBRESSL_VERSION_NUMBER) && (OPENSSL_VERSION_NUMBER >= 0x10100000L)) +#define HAVE_SSL_EXTRACT_RANDOM +#endif + +#if ((OPENSSL_VERSION_NUMBER >= 0x10101000L) && !defined(OPENSSL_IS_BORINGSSL) && !defined(LIBRESSL_VERSION_NUMBER)) +#define HAVE_SSL_RAND_KEEP_RANDOM_DEVICES_OPEN +#endif + +#if ((OPENSSL_VERSION_NUMBER >= 0x10101000L) && !defined(LIBRESSL_VERSION_NUMBER) && !defined(OPENSSL_IS_BORINGSSL)) || defined(USE_OPENSSL_WOLFSSL) +#define HAVE_SSL_CTX_SET_CIPHERSUITES +#define HAVE_ASN1_TIME_TO_TM +#endif + +#if (defined(SSL_CLIENT_HELLO_CB) || defined(OPENSSL_IS_BORINGSSL)) +#define HAVE_SSL_CLIENT_HELLO_CB +#endif + +#if ((OPENSSL_VERSION_NUMBER >= 0x1000200fL) && !defined(OPENSSL_NO_TLSEXT) && !defined(LIBRESSL_VERSION_NUMBER) && !defined(OPENSSL_IS_BORINGSSL)) +#define HAVE_SSL_CTX_ADD_SERVER_CUSTOM_EXT +#endif + +#if ((OPENSSL_VERSION_NUMBER >= 0x10002000L) && !defined(LIBRESSL_VERSION_NUMBER)) +#define HAVE_SSL_CTX_get0_privatekey +#endif + +#if HA_OPENSSL_VERSION_NUMBER >= 0x1000104fL || defined(USE_OPENSSL_WOLFSSL) || defined(USE_OPENSSL_AWSLC) +/* CRYPTO_memcmp() is present since openssl 1.0.1d */ +#define HAVE_CRYPTO_memcmp +#endif + +#if (defined(SN_ct_cert_scts) && !defined(OPENSSL_NO_TLSEXT)) +#define HAVE_SSL_SCTL +#endif + +#if (HA_OPENSSL_VERSION_NUMBER >= 0x10101000L) || defined(USE_OPENSSL_AWSLC) || (defined(USE_OPENSSL_WOLFSSL) && defined(HAVE_SECRET_CALLBACK)) +#define 
HAVE_SSL_KEYLOG +#endif + +/* minimum OpenSSL 1.1.1 & libreSSL 3.3.6 */ +#if (defined(LIBRESSL_VERSION_NUMBER) && (LIBRESSL_VERSION_NUMBER >= 0x3030600L)) || (HA_OPENSSL_VERSION_NUMBER >= 0x10101000L) || defined(USE_OPENSSL_WOLFSSL) +#define HAVE_SSL_get0_verified_chain +#endif + + +#if (HA_OPENSSL_VERSION_NUMBER >= 0x3000000fL) +#define HAVE_OSSL_PARAM +#define MAC_CTX EVP_MAC_CTX +#define HASSL_DH EVP_PKEY +#define HASSL_DH_free EVP_PKEY_free +#define HASSL_DH_up_ref EVP_PKEY_up_ref + +#define HAVE_SSL_PROVIDERS + +#else /* HA_OPENSSL_VERSION_NUMBER >= 0x3000000fL */ +#define MAC_CTX HMAC_CTX +#define HASSL_DH DH +#define HASSL_DH_free DH_free +#define HASSL_DH_up_ref DH_up_ref +#endif + +#if ((HA_OPENSSL_VERSION_NUMBER < 0x1000000fL) && !defined(X509_get_X509_PUBKEY)) +#define X509_get_X509_PUBKEY(x) ((x)->cert_info->key) +#endif + + +#if (HA_OPENSSL_VERSION_NUMBER < 0x1000100fL) +/* + * Functions introduced in OpenSSL 1.0.1 + */ +static inline int SSL_SESSION_set1_id_context(SSL_SESSION *s, const unsigned char *sid_ctx, unsigned int sid_ctx_len) +{ + s->sid_ctx_length = sid_ctx_len; + memcpy(s->sid_ctx, sid_ctx, sid_ctx_len); + return 1; +} +#endif + + +#if (HA_OPENSSL_VERSION_NUMBER < 0x1000200fL) && (!defined(LIBRESSL_VERSION_NUMBER) || LIBRESSL_VERSION_NUMBER < 0x2070500fL) +/* introduced in openssl 1.0.2 */ + +static inline STACK_OF(X509) *X509_chain_up_ref(STACK_OF(X509) *chain) +{ + STACK_OF(X509) *ret; + int i; + + if ((ret = sk_X509_dup(chain)) == NULL) + return NULL; + for (i = 0; i < sk_X509_num(ret); i++) { + X509 *x = sk_X509_value(ret, i); + CRYPTO_add(&x->references, 1, CRYPTO_LOCK_X509); + } + return ret; +} + +#endif + +#ifdef OPENSSL_IS_BORINGSSL +/* + * Functions missing in BoringSSL + */ + +static inline X509_CRL *X509_OBJECT_get0_X509_CRL(const X509_OBJECT *a) +{ + if (a == NULL || a->type != X509_LU_CRL) { + return NULL; + } + return a->data.crl; +} +#endif + +#if (HA_OPENSSL_VERSION_NUMBER < 0x1010000fL) && (!defined(LIBRESSL_VERSION_NUMBER) || LIBRESSL_VERSION_NUMBER < 0x2070000fL) +/* + * Functions introduced in OpenSSL 1.1.0 and in LibreSSL 2.7.0 + */ + +static inline STACK_OF(X509_OBJECT) *X509_STORE_get0_objects(X509_STORE *st) +{ + return st->objs; +} + +static inline int X509_OBJECT_get_type(const X509_OBJECT *a) +{ + return a->type; +} + +static inline X509 *X509_OBJECT_get0_X509(const X509_OBJECT *a) +{ + if (a == NULL || a->type != X509_LU_X509) { + return NULL; + } + return a->data.x509; +} + +static inline X509_CRL *X509_OBJECT_get0_X509_CRL(const X509_OBJECT *a) +{ + if (a == NULL || a->type != X509_LU_CRL) { + return NULL; + } + return a->data.crl; +} + +static inline int SSL_SESSION_set1_id(SSL_SESSION *s, const unsigned char *sid, unsigned int sid_len) +{ + s->session_id_length = sid_len; + memcpy(s->session_id, sid, sid_len); + return 1; +} + +static inline X509_ALGOR *X509_get0_tbs_sigalg(const X509 *x) +{ + return x->cert_info->signature; +} + +#if (!defined OPENSSL_NO_OCSP) +static inline const OCSP_CERTID *OCSP_SINGLERESP_get0_id(const OCSP_SINGLERESP *single) +{ + return single->certId; +} +#endif + +#ifndef OPENSSL_NO_DH +static inline int DH_set0_pqg(DH *dh, BIGNUM *p, BIGNUM *q, BIGNUM *g) +{ + /* Implements only the bare necessities for HAProxy */ + dh->p = p; + dh->g = g; + return 1; +} +#endif + +static inline const unsigned char *ASN1_STRING_get0_data(const ASN1_STRING *x) +{ + return x->data; +} + +static inline void X509_up_ref(X509 *x) +{ + CRYPTO_add(&x->references, 1, CRYPTO_LOCK_X509); +} + +static inline void 
EVP_PKEY_up_ref(EVP_PKEY *pkey) +{ + CRYPTO_add(&pkey->references, 1, CRYPTO_LOCK_EVP_PKEY); +} + +static inline void SSL_CTX_up_ref(SSL_CTX *ctx) +{ + CRYPTO_add(&ctx->references, 1, CRYPTO_LOCK_SSL_CTX); +} + +static inline int X509_CRL_get_signature_nid(const X509_CRL *crl) +{ + return OBJ_obj2nid(crl->sig_alg->algorithm); +} + +static inline const ASN1_TIME *X509_CRL_get0_lastUpdate(const X509_CRL *crl) +{ + return X509_CRL_get_lastUpdate(crl); +} + +static inline const ASN1_TIME *X509_CRL_get0_nextUpdate(const X509_CRL *crl) +{ + return X509_CRL_get_nextUpdate(crl); +} + +static inline const ASN1_INTEGER *X509_REVOKED_get0_serialNumber(const X509_REVOKED *x) +{ + return x->serialNumber; +} + +static inline const ASN1_TIME *X509_REVOKED_get0_revocationDate(const X509_REVOKED *x) +{ + return x->revocationDate; +} + +static inline X509 *X509_STORE_CTX_get0_cert(X509_STORE_CTX *ctx) +{ + return ctx->cert; +} + +static inline int ECDSA_SIG_set0(ECDSA_SIG *sig, BIGNUM *r, BIGNUM *s) +{ + if (r == NULL || s == NULL) + return 0; + BN_clear_free(sig->r); + BN_clear_free(sig->s); + + sig->r = r; + sig->s = s; + return 1; +} + +#endif + +#if (HA_OPENSSL_VERSION_NUMBER < 0x3000000fL) +#if defined(SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB) +#define SSL_CTX_set_tlsext_ticket_key_evp_cb SSL_CTX_set_tlsext_ticket_key_cb +#endif + +/* + * Functions introduced in OpenSSL 3.0.0 + */ +static inline unsigned long ERR_peek_error_func(const char **func) +{ + unsigned long ret = ERR_peek_error(); + if (ret == 0) + return ret; + + if (func) + *func = ERR_func_error_string(ret); + + return ret; +} + +#endif + +#if (HA_OPENSSL_VERSION_NUMBER >= 0x1010000fL) || (defined(LIBRESSL_VERSION_NUMBER) && LIBRESSL_VERSION_NUMBER >= 0x2070200fL) +#define __OPENSSL_110_CONST__ const +#else +#define __OPENSSL_110_CONST__ +#endif + +/* ERR_remove_state() was deprecated in 1.0.0 in favor of + * ERR_remove_thread_state(), which was in turn deprecated in + * 1.1.0 and does nothing anymore. Let's simply silently kill + * it. + */ +#if (HA_OPENSSL_VERSION_NUMBER >= 0x1010000fL) +#undef ERR_remove_state +#define ERR_remove_state(x) +#endif + + +/* RAND_pseudo_bytes() is deprecated in 1.1.0 in favor of RAND_bytes(). Note + * that the return codes differ, but it happens that the only use case (ticket + * key update) was already wrong, considering a non-cryptographic random as a + * failure. 
+ */ +#if (HA_OPENSSL_VERSION_NUMBER >= 0x1010000fL) +#undef RAND_pseudo_bytes +#define RAND_pseudo_bytes(x,y) RAND_bytes(x,y) +#endif + + +/* Signature from RFC 5246, missing in openssl < 1.0.1 */ +#ifndef TLSEXT_signature_anonymous +#define TLSEXT_signature_anonymous 0 +#define TLSEXT_signature_rsa 1 +#define TLSEXT_signature_dsa 2 +#define TLSEXT_signature_ecdsa 3 +#endif + +#if ((HA_OPENSSL_VERSION_NUMBER < 0x1010000fL) && (!defined(LIBRESSL_VERSION_NUMBER) || LIBRESSL_VERSION_NUMBER < 0x2070000fL)) ||\ + defined(OPENSSL_IS_BORINGSSL) +#define X509_getm_notBefore X509_get_notBefore +#define X509_getm_notAfter X509_get_notAfter +#endif + +#if !defined(EVP_CTRL_AEAD_SET_IVLEN) +#define EVP_CTRL_AEAD_SET_IVLEN EVP_CTRL_GCM_SET_IVLEN +#endif + +#if !defined(EVP_CTRL_AEAD_SET_TAG) +#define EVP_CTRL_AEAD_SET_TAG EVP_CTRL_GCM_SET_TAG +#endif + +/* Supported hash function for TLS tickets */ +#ifdef OPENSSL_NO_SHA256 +#define TLS_TICKET_HASH_FUNCT EVP_sha1 +#else +#define TLS_TICKET_HASH_FUNCT EVP_sha256 +#endif /* OPENSSL_NO_SHA256 */ + +#ifndef SSL_OP_CIPHER_SERVER_PREFERENCE /* needs OpenSSL >= 0.9.7 */ +#define SSL_OP_CIPHER_SERVER_PREFERENCE 0 +#endif + +#ifndef SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION /* needs OpenSSL >= 0.9.7 */ +#define SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION 0 +#define SSL_renegotiate_pending(arg) 0 +#endif + +#ifndef SSL_OP_SINGLE_ECDH_USE /* needs OpenSSL >= 0.9.8 */ +#define SSL_OP_SINGLE_ECDH_USE 0 +#endif + +#ifndef SSL_OP_NO_TICKET /* needs OpenSSL >= 0.9.8 */ +#define SSL_OP_NO_TICKET 0 +#endif + +#ifndef SSL_OP_NO_COMPRESSION /* needs OpenSSL >= 0.9.9 */ +#define SSL_OP_NO_COMPRESSION 0 +#endif + +#ifdef OPENSSL_NO_SSL3 /* SSLv3 support removed */ +#undef SSL_OP_NO_SSLv3 +#define SSL_OP_NO_SSLv3 0 +#endif + +#ifndef SSL_OP_NO_TLSv1_1 /* needs OpenSSL >= 1.0.1 */ +#define SSL_OP_NO_TLSv1_1 0 +#endif + +#ifndef SSL_OP_NO_TLSv1_2 /* needs OpenSSL >= 1.0.1 */ +#define SSL_OP_NO_TLSv1_2 0 +#endif + +#ifndef SSL_OP_NO_TLSv1_3 /* needs OpenSSL >= 1.1.1 */ +#define SSL_OP_NO_TLSv1_3 0 +#endif + +#ifndef SSL_OP_SINGLE_DH_USE /* needs OpenSSL >= 0.9.6 */ +#define SSL_OP_SINGLE_DH_USE 0 +#endif + +#ifndef SSL_OP_SINGLE_ECDH_USE /* needs OpenSSL >= 1.0.0 */ +#define SSL_OP_SINGLE_ECDH_USE 0 +#endif + +#ifndef SSL_MODE_RELEASE_BUFFERS /* needs OpenSSL >= 1.0.0 */ +#define SSL_MODE_RELEASE_BUFFERS 0 +#endif + +#ifndef SSL_MODE_SMALL_BUFFERS /* needs small_records.patch */ +#define SSL_MODE_SMALL_BUFFERS 0 +#endif + +#ifndef SSL_OP_PRIORITIZE_CHACHA /* needs OpenSSL >= 1.1.1 */ +#define SSL_OP_PRIORITIZE_CHACHA 0 +#endif + +#ifndef SSL_CTRL_GET_EXTRA_CHAIN_CERTS +#define SSL_CTX_get_extra_chain_certs(ctx, chain) do { *(chain) = (ctx)->extra_certs; } while (0) +#endif + +#if HA_OPENSSL_VERSION_NUMBER < 0x10100000L && (!defined(LIBRESSL_VERSION_NUMBER) || LIBRESSL_VERSION_NUMBER < 0x2070000fL) +#define BIO_get_data(b) (b)->ptr +#define BIO_set_data(b, v) do { (b)->ptr = (v); } while (0) +#define BIO_set_init(b, v) do { (b)->init = (v); } while (0) + +#define BIO_meth_free(m) free(m) +#define BIO_meth_new(type, name) calloc(1, sizeof(BIO_METHOD)) +#define BIO_meth_set_gets(m, f) do { (m)->bgets = (f); } while (0) +#define BIO_meth_set_puts(m, f) do { (m)->bputs = (f); } while (0) +#define BIO_meth_set_read(m, f) do { (m)->bread = (f); } while (0) +#define BIO_meth_set_write(m, f) do { (m)->bwrite = (f); } while (0) +#define BIO_meth_set_create(m, f) do { (m)->create = (f); } while (0) +#define BIO_meth_set_ctrl(m, f) do { (m)->ctrl = (f); } while (0) +#define 
BIO_meth_set_destroy(m, f) do { (m)->destroy = (f); } while (0) +#endif + +#ifndef SSL_CTX_set_ecdh_auto +#define SSL_CTX_set_ecdh_auto(dummy, onoff) ((onoff) != 0) +#endif + +/* The EVP_MD_CTX_create() and EVP_MD_CTX_destroy() functions were renamed to + * EVP_MD_CTX_new() and EVP_MD_CTX_free() in OpenSSL 1.1.0, respectively. + */ +#if (HA_OPENSSL_VERSION_NUMBER < 0x1010000fL) +#define EVP_MD_CTX_new EVP_MD_CTX_create +#define EVP_MD_CTX_free EVP_MD_CTX_destroy +#endif + +/* OpenSSL 1.0.2 and onwards define SSL_CTX_set1_curves_list which is both a + * function and a macro. OpenSSL 1.0.2 to 1.1.0 define SSL_CTRL_SET_CURVES_LIST + * as a macro, which disappeared from 1.1.1. BoringSSL only has that one and + * not the former macro but it does have the function. Let's keep the test on + * the macro matching the function name. + */ +#if !defined(SSL_CTX_set1_curves_list) && defined(SSL_CTRL_SET_CURVES_LIST) +#define SSL_CTX_set1_curves_list SSL_CTX_set1_curves_list +#endif + +#if !defined(SSL_CTX_set1_sigalgs_list) && defined(SSL_CTRL_SET_SIGALGS_LIST) +#define SSL_CTX_set1_sigalgs_list SSL_CTX_set1_sigalgs_list +#endif + +#endif /* USE_OPENSSL */ +#endif /* _HAPROXY_OPENSSL_COMPAT_H */ diff --git a/include/haproxy/pattern-t.h b/include/haproxy/pattern-t.h new file mode 100644 index 0000000..6c1ba24 --- /dev/null +++ b/include/haproxy/pattern-t.h @@ -0,0 +1,235 @@ +/* + * include/haproxy/pattern-t.h + * This file provides structures and types for ACLs. + * + * Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PATTERN_T_H +#define _HAPROXY_PATTERN_T_H + +#include <import/ebtree-t.h> + +#include <haproxy/api-t.h> +#include <haproxy/regex-t.h> +#include <haproxy/sample_data-t.h> +#include <haproxy/thread-t.h> + + +/* Pattern matching function result. + * + * We're using a 3-state matching system to match samples against patterns in + * ACLs : + * - PASS : at least one pattern already matches + * - MISS : some data is missing to decide if some rules may finally match. + * - FAIL : no pattern may ever match + * + * We assign values 0, 1 and 3 to FAIL, MISS and PASS respectively, so that we + * can make use of standard arithmetic for the truth tables below : + * + * x | !x x&y | F(0) | M(1) | P(3) x|y | F(0) | M(1) | P(3) + * ------+----- -----+------+------+----- -----+------+------+----- + * F(0) | P(3) F(0)| F(0) | F(0) | F(0) F(0)| F(0) | M(1) | P(3) + * M(1) | M(1) M(1)| F(0) | M(1) | M(1) M(1)| M(1) | M(1) | P(3) + * P(3) | F(0) P(3)| F(0) | M(1) | P(3) P(3)| P(3) | P(3) | P(3) + * + * neg(x) = (3 >> x) and(x,y) = (x & y) or(x,y) = (x | y) + * + * For efficiency, the ACL return flags are directly mapped from the pattern + * match flags. A pattern can't return "MISS" since it's always presented an + * existing sample. 
So that leaves us with only two possible values : + * MATCH = 3 + * NOMATCH = 0 + */ +enum pat_match_res { + PAT_NOMATCH = 0, /* sample didn't match any pattern */ + PAT_MATCH = 3, /* sample matched at least one pattern */ +}; + +/* possible flags for patterns matching or parsing */ +enum { + PAT_MF_IGNORE_CASE = 1 << 0, /* ignore case */ + PAT_MF_NO_DNS = 1 << 1, /* don't perform any DNS requests */ +}; + +/* possible flags for patterns storage */ +enum { + PAT_SF_TREE = 1 << 0, /* some patterns are arranged in a tree */ + PAT_SF_REGFREE = 1 << 1, /* run regex_free() on the pointer */ +}; + +/* ACL match methods */ +enum { + PAT_MATCH_FOUND, /* just ensure that fetch found the sample */ + PAT_MATCH_BOOL, /* match fetch's integer value as boolean */ + PAT_MATCH_INT, /* unsigned integer (int) */ + PAT_MATCH_IP, /* IPv4/IPv6 address (IP) */ + PAT_MATCH_BIN, /* hex string (bin) */ + PAT_MATCH_LEN, /* string length (str -> int) */ + PAT_MATCH_STR, /* exact string match (str) */ + PAT_MATCH_BEG, /* beginning of string (str) */ + PAT_MATCH_SUB, /* substring (str) */ + PAT_MATCH_DIR, /* directory-like sub-string (str) */ + PAT_MATCH_DOM, /* domain-like sub-string (str) */ + PAT_MATCH_END, /* end of string (str) */ + PAT_MATCH_REG, /* regex (str -> reg) */ + PAT_MATCH_REGM, /* regex (str -> reg) with match zones */ + /* keep this one last */ + PAT_MATCH_NUM +}; + +#define PAT_REF_MAP 0x1 /* Set if the reference is used by at least one map. */ +#define PAT_REF_ACL 0x2 /* Set if the reference is used by at least one acl. */ +#define PAT_REF_SMP 0x4 /* Flag used if the reference contains a sample. */ + +/* This struct contains a list of reference strings for dynamically + * updatable patterns. + */ +struct pat_ref { + struct list list; /* Used to chain refs. */ + char *reference; /* The reference name. */ + char *display; /* String displayed to identify the pattern origin. */ + struct list head; /* The head of the list of struct pat_ref_elt. */ + struct eb_root ebmb_root; /* The tree where pattern reference elements are attached. */ + struct list pat; /* The head of the list of struct pattern_expr. */ + unsigned int flags; /* flags PAT_REF_*. */ + unsigned int curr_gen; /* current generation number (anything below can be removed) */ + unsigned int next_gen; /* next generation number (insertions use this one) */ + int unique_id; /* Each pattern reference has a unique id. */ + unsigned long long revision; /* updated for each update */ + unsigned long long entry_cnt; /* the total number of entries */ + THREAD_ALIGN(64); + __decl_thread(HA_RWLOCK_T lock); /* Lock used to protect pat ref elements */ +}; + +/* This is a part of struct pat_ref. Each entry contains one pattern and one + * associated value as original string. All derivative forms (via exprs) are + * accessed from list_head or tree_head. Be careful, it's variable-sized! + */ +struct pat_ref_elt { + struct list list; /* Used to chain elements. */ + struct list back_refs; /* list of users tracking this pat ref */ + void *list_head; /* all &pattern_list->from_ref derived from this reference, ends with NULL */ + void *tree_head; /* all &pattern_tree->from_ref derived from this reference, ends with NULL */ + char *sample; + unsigned int gen_id; /* generation of pat_ref this was made for */ + int line; + struct ebmb_node node; /* Node to attach this element to its <pat_ref> ebtree. */ + const char pattern[0]; // const only to make sure nobody tries to free it. +};
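+ +/* Note on sizing (a sketch, not code from this file): since the key string is + * stored in the trailing flexible array, an element carrying the pattern + * "10.0.0.0/8" would be allocated as one block, along the lines of + * malloc(sizeof(struct pat_ref_elt) + strlen(key) + 1), with the key copied + * into <pattern>. + */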
+ +/* This contains one tree-indexed entry. This struct makes it possible to + * associate a "sample" with a tree entry. It is used with maps. + */ +struct pattern_tree { + void *from_ref; // pattern_tree linked from pat_ref_elt, ends with NULL + struct sample_data *data; + struct pat_ref_elt *ref; + struct pattern_expr *expr; + struct ebmb_node node; +}; + +/* This describes one ACL pattern, which might be a single value or a tree of + * values. All patterns for a single ACL expression are linked together. Some + * of them might have a type (eg: IP). Right now, the types are shared with + * the samples, though it is possible that in the future this will change to + * accommodate other types (eg: meth, regex). Unsigned and constant types + * are preferred when there is a doubt. + */ +struct pattern { + int type; /* type of the ACL pattern (SMP_T_*) */ + union { + int i; /* integer value */ + struct { + signed long long min, max; + unsigned int min_set:1; + unsigned int max_set:1; + } range; /* integer range */ + struct { + struct in_addr addr; + struct in_addr mask; + } ipv4; /* IPv4 address */ + struct { + struct in6_addr addr; + unsigned char mask; /* number of bits */ + } ipv6; /* IPv6 address/mask */ + } val; /* direct value */ + union { + void *ptr; /* any data */ + char *str; /* any string */ + struct my_regex *reg; /* a compiled regex */ + } ptr; /* indirect values, allocated or NULL */ + int len; /* data length when required */ + int sflags; /* flags relative to the storage method. */ + struct sample_data *data; /* used to store a pointer to sample value associated + with the match. It is used with maps */ + struct pat_ref_elt *ref; +}; + +/* This struct is just used for chaining patterns */ +struct pattern_list { + void *from_ref; // pattern_list linked from pat_ref_elt, ends with NULL + struct list list; + struct pattern pat; + struct pattern_expr *expr; +}; + +/* Description of a pattern expression. + * It contains pointers to the parse and match functions, and a list or tree of + * patterns to test against. The structure is organized so that the hot parts + * are grouped together in order to optimize caching. + */ +struct pattern_expr { + struct list list; /* Used for chaining pattern_expr in pat_ref. */ + struct pat_ref *ref; /* The pattern reference, if any. */ + struct pattern_head *pat_head; /* Points to the pattern_head that contains the manipulation functions. + * Note that this link points to a compatible head but not necessarily + * to the real one, so only the functions may be used, never the + * "head" itself. Don't write "(struct pattern_expr *)any->pat_head->expr". + */ + struct list patterns; /* list of acl_patterns */ + struct eb_root pattern_tree; /* may be used for lookup in large datasets */ + struct eb_root pattern_tree_2; /* may be used for different types */ + int mflags; /* flags relative to the parsing or matching method. */ + __decl_thread(HA_RWLOCK_T lock); /* lock used to protect patterns */ +}; + +/* This is a list of expressions. A struct pattern_expr can be used by + * more than one "struct pattern_head"; this intermediate struct + * permits membership in more than one list. + */ +struct pattern_expr_list { + struct list list; /* Used for chaining pattern_expr in pattern_head. */ + int do_free; + struct pattern_expr *expr; /* The used expr.
*/ +}; + + +/* This struct contains a list of pattern exprs */ +struct sample; +struct pattern_head { + int (*parse)(const char *text, struct pattern *pattern, int flags, char **err); + int (*parse_smp)(const char *text, struct sample_data *data); + int (*index)(struct pattern_expr *, struct pattern *, char **); + void (*prune)(struct pattern_expr *); + struct pattern *(*match)(struct sample *, struct pattern_expr *, int); + int expect_type; /* type of the expected sample (SMP_T_*) */ + + struct list head; /* This is a list of struct pattern_expr_list. */ +}; + +#endif /* _HAPROXY_PATTERN_T_H */ diff --git a/include/haproxy/pattern.h b/include/haproxy/pattern.h new file mode 100644 index 0000000..49e5ad2 --- /dev/null +++ b/include/haproxy/pattern.h @@ -0,0 +1,273 @@ +/* + * include/haproxy/pattern.h + * This file provides structures and types for pattern matching. + * + * Copyright (C) 2000-2013 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PATTERN_H +#define _HAPROXY_PATTERN_H + +#include <string.h> + +#include <haproxy/api.h> +#include <haproxy/pattern-t.h> +#include <haproxy/sample-t.h> + +/* pattern management function arrays */ +extern const char *const pat_match_names[PAT_MATCH_NUM]; +extern int const pat_match_types[PAT_MATCH_NUM]; + +extern int (*const pat_parse_fcts[PAT_MATCH_NUM])(const char *, struct pattern *, int, char **); +extern int (*const pat_index_fcts[PAT_MATCH_NUM])(struct pattern_expr *, struct pattern *, char **); +extern void (*const pat_prune_fcts[PAT_MATCH_NUM])(struct pattern_expr *); +extern struct pattern *(*const pat_match_fcts[PAT_MATCH_NUM])(struct sample *, struct pattern_expr *, int); + +/* This is the root of the list of all available pattern_refs. */ +extern struct list pattern_reference; + +int pattern_finalize_config(void); + +/* return the PAT_MATCH_* index for match name "name", or < 0 if not found */ +static inline int pat_find_match_name(const char *name) +{ + int i; + + for (i = 0; i < PAT_MATCH_NUM; i++) + if (strcmp(name, pat_match_names[i]) == 0) + return i; + return -1; } + +/* This function executes a pattern match on a sample. It applies the patterns + * from head <head> to sample <smp>. The function returns NULL if the sample + * doesn't match and non-NULL if it matches. If <fill> is true and the sample + * matches, the function returns the matched pattern. In many cases, this + * pattern can be a static buffer.
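+ * + * Minimal usage sketch (the head and smp variables are illustrative names, + * not part of this API's contract): + * + *     struct pattern *p = pattern_exec_match(&head, &smp, 1); + *     if (p && p->data) + *             ... use the sample_data associated with the match (maps) ...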
*/ +struct pattern *pattern_exec_match(struct pattern_head *head, struct sample *smp, int fill); + +/* + * + * The following functions get "pattern", duplicate it and index it in "expr" + * + */ +int pat_idx_list_val(struct pattern_expr *expr, struct pattern *pat, char **err); +int pat_idx_list_ptr(struct pattern_expr *expr, struct pattern *pat, char **err); +int pat_idx_list_str(struct pattern_expr *expr, struct pattern *pat, char **err); +int pat_idx_list_reg(struct pattern_expr *expr, struct pattern *pat, char **err); +int pat_idx_list_regm(struct pattern_expr *expr, struct pattern *pat, char **err); +int pat_idx_tree_ip(struct pattern_expr *expr, struct pattern *pat, char **err); +int pat_idx_tree_str(struct pattern_expr *expr, struct pattern *pat, char **err); +int pat_idx_tree_pfx(struct pattern_expr *expr, struct pattern *pat, char **err); + +/* + * + * The following function deletes all patterns related to reference pattern + * element <elt> in pattern reference <ref>. + * + */ +void pat_delete_gen(struct pat_ref *ref, struct pat_ref_elt *elt); + +/* + * + * The following function cleans all entries of a pattern expression and + * resets the tree and list roots. + * + */ +void pat_prune_gen(struct pattern_expr *expr); + +/* + * + * The following functions are general purpose pattern matching functions. + * + */ + + +/* ignore the current line */ +int pat_parse_nothing(const char *text, struct pattern *pattern, int mflags, char **err); + +/* Parse an integer. It is put both in min and max. */ +int pat_parse_int(const char *text, struct pattern *pattern, int mflags, char **err); + +/* Parse a version. It is put both in min and max. */ +int pat_parse_dotted_ver(const char *text, struct pattern *pattern, int mflags, char **err); + +/* Parse a range of integers delimited by either ':' or '-'. If only one + * integer is read, it is set as both min and max. + */ +int pat_parse_range(const char *text, struct pattern *pattern, int mflags, char **err); + +/* Parse a string. It is allocated and duplicated. */ +int pat_parse_str(const char *text, struct pattern *pattern, int mflags, char **err); + +/* Parse a hexadecimal binary definition. It is allocated and duplicated. */ +int pat_parse_bin(const char *text, struct pattern *pattern, int mflags, char **err); + +/* Parse a regex. It is allocated. */ +int pat_parse_reg(const char *text, struct pattern *pattern, int mflags, char **err); + +/* Parse an IP address and an optional mask in the form addr[/mask]. + * The addr may either be an IPv4 or IPv6 address, or a hostname that resolves + * to a valid IPv4 address. The mask can be provided as a number of bits, or + * even as a dotted mask (but the latter only works for IPv4 addresses). + * Returns 1 if OK, otherwise 0.
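+ * + * For example, per the rules above, "10.0.0.0/8", "192.168.0.1/255.255.255.0" + * and "fe80::/10" are all accepted forms.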
*/ +int pat_parse_ip(const char *text, struct pattern *pattern, int mflags, char **err); + +/* NB: For two strings to be identical, it is required that their lengths match */ +struct pattern *pat_match_str(struct sample *smp, struct pattern_expr *expr, int fill); + +/* NB: For two binary buffers to be identical, it is required that their lengths match */ +struct pattern *pat_match_bin(struct sample *smp, struct pattern_expr *expr, int fill); + +/* Checks that the length of the pattern in <test> is included between min and max */ +struct pattern *pat_match_len(struct sample *smp, struct pattern_expr *expr, int fill); + +/* Checks that the integer in <test> is included between min and max */ +struct pattern *pat_match_int(struct sample *smp, struct pattern_expr *expr, int fill); + +/* always returns false */ +struct pattern *pat_match_nothing(struct sample *smp, struct pattern_expr *expr, int fill); + +/* Checks that the pattern matches the end of the tested string. */ +struct pattern *pat_match_end(struct sample *smp, struct pattern_expr *expr, int fill); + +/* Checks that the pattern matches the beginning of the tested string. */ +struct pattern *pat_match_beg(struct sample *smp, struct pattern_expr *expr, int fill); + +/* Checks that the pattern is included inside the tested string. */ +struct pattern *pat_match_sub(struct sample *smp, struct pattern_expr *expr, int fill); + +/* Checks that the pattern is included inside the tested string, but enclosed + * between slashes or at the beginning or end of the string. Slashes at the + * beginning or end of the pattern are ignored. + */ +struct pattern *pat_match_dir(struct sample *smp, struct pattern_expr *expr, int fill); + +/* Checks that the pattern is included inside the tested string, but enclosed + * between dots or at the beginning or end of the string. Dots at the beginning + * or end of the pattern are ignored. + */ +struct pattern *pat_match_dom(struct sample *smp, struct pattern_expr *expr, int fill); + +/* Checks that the input IP address (IPv4 or IPv6) in <smp> matches the IP/mask + * in the pattern + */ +struct pattern *pat_match_ip(struct sample *smp, struct pattern_expr *expr, int fill); + +/* Executes a regex. It temporarily changes the data to add a trailing zero, + * and restores the previous character when leaving. + */ +struct pattern *pat_match_reg(struct sample *smp, struct pattern_expr *expr, int fill); +struct pattern *pat_match_regm(struct sample *smp, struct pattern_expr *expr, int fill);
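+ +/* Worked example of the domain semantics above (illustrative, not code): with + * "example.org" indexed under PAT_MATCH_DOM, the input "www.example.org" + * matches (the occurrence is preceded by a dot and ends the string), while + * "badexample.org" does not, since the occurrence is not delimited by dots or + * string boundaries. + */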
+ +/* + * pattern_ref manipulation. + */ +struct pat_ref *pat_ref_lookup(const char *reference); +struct pat_ref *pat_ref_lookupid(int unique_id); +struct pat_ref *pat_ref_new(const char *reference, const char *display, unsigned int flags); +struct pat_ref *pat_ref_newid(int unique_id, const char *display, unsigned int flags); +struct pat_ref_elt *pat_ref_find_elt(struct pat_ref *ref, const char *key); +struct pat_ref_elt *pat_ref_append(struct pat_ref *ref, const char *pattern, const char *sample, int line); +struct pat_ref_elt *pat_ref_load(struct pat_ref *ref, unsigned int gen, const char *pattern, const char *sample, int line, char **err); +int pat_ref_push(struct pat_ref_elt *elt, struct pattern_expr *expr, int patflags, char **err); +int pat_ref_add(struct pat_ref *ref, const char *pattern, const char *sample, char **err); +int pat_ref_set(struct pat_ref *ref, const char *pattern, const char *sample, char **err, struct pat_ref_elt *elt); +int pat_ref_set_by_id(struct pat_ref *ref, struct pat_ref_elt *refelt, const char *value, char **err); +int pat_ref_delete(struct pat_ref *ref, const char *key); +void pat_ref_delete_by_ptr(struct pat_ref *ref, struct pat_ref_elt *elt); +int pat_ref_delete_by_id(struct pat_ref *ref, struct pat_ref_elt *refelt); +int pat_ref_prune(struct pat_ref *ref); +int pat_ref_commit_elt(struct pat_ref *ref, struct pat_ref_elt *elt, char **err); +int pat_ref_purge_range(struct pat_ref *ref, uint from, uint to, int budget); + +/* Creates a new generation number for the next pattern updates and returns it. + * This must be used to atomically insert new patterns that will atomically + * replace all current ones on commit. Generation numbers start at zero and are + * only ever incremented, wrapping at 2^32. There must not be more than 2^31-1 + * of them reserved without a commit. The new reserved number is returned. + * Locking is not necessary. + */ +static inline unsigned int pat_ref_newgen(struct pat_ref *ref) +{ + return HA_ATOMIC_ADD_FETCH(&ref->next_gen, 1); +} + +/* Give up a previously assigned generation number. By doing this the caller + * certifies that no element was inserted using this number, and that this + * number might safely be reused if none was assigned since. This is convenient + * to avoid wasting numbers in case an operation couldn't be started right + * after a call to pat_ref_newgen(), but it is absolutely not necessary. The + * main use case is to politely abandon an update attempt upon error just after + * having received a number (e.g. attempting to retrieve entries from the + * network, and failing to establish a connection). This is done atomically so + * no locking is necessary. + */ +static inline void pat_ref_giveup(struct pat_ref *ref, unsigned int gen) +{ + HA_ATOMIC_CAS(&ref->next_gen, &gen, gen - 1); +} + +/* Commit the whole pattern reference by updating the generation number or + * failing in case someone else managed to do it meanwhile. While this could + * be done using a CAS, it must instead be called with the PATREF_LOCK held in + * order to guarantee the consistency of the generation number for all other + * functions that rely on it. It returns zero on success, non-zero on failure + * (technically speaking it returns the difference between the attempted + * generation and the effective one, so that it can be used for reporting).
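+ * + * A typical atomic bulk update is thus, as a sketch (PATREF_LOCK handling and + * error checks elided): + * + *     gen = pat_ref_newgen(ref); + *     pat_ref_load(ref, gen, pattern, sample, line, &err);  ... once per entry + *     if (pat_ref_commit(ref, gen) == 0) + *             pat_ref_purge_older(ref, gen, -1);            ... drop old entries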
+ */ +static inline int pat_ref_commit(struct pat_ref *ref, unsigned int gen) +{ + if ((int)(gen - ref->curr_gen) > 0) + ref->curr_gen = gen; + return gen - ref->curr_gen; +} + +/* This function purges all elements from <ref> that are older than generation + * <oldest>. It will not purge more than <budget> entries at once, in order to + * remain responsive. If budget is negative, no limit is applied. + * The caller must already hold the PATREF_LOCK on <ref>. The function will + * take the PATEXP_LOCK on all expressions of the pattern as needed. It returns + * non-zero on completion, or zero if it had to stop before the end after + * <budget> was depleted. + */ +static inline int pat_ref_purge_older(struct pat_ref *ref, uint oldest, int budget) +{ + return pat_ref_purge_range(ref, oldest + 1, oldest - 1, budget); +} + + +/* + * pattern_head manipulation. + */ +void pattern_init_head(struct pattern_head *head); +void pattern_prune(struct pattern_head *head); +int pattern_read_from_file(struct pattern_head *head, unsigned int refflags, const char *filename, int patflags, int load_smp, char **err, const char *file, int line); + +/* + * pattern_expr manipulation. + */ +void pattern_init_expr(struct pattern_expr *expr); +struct pattern_expr *pattern_lookup_expr(struct pattern_head *head, struct pat_ref *ref); +struct pattern_expr *pattern_new_expr(struct pattern_head *head, struct pat_ref *ref, + int patflags, char **err, int *reuse); +struct sample_data **pattern_find_smp(struct pattern_expr *expr, struct pat_ref_elt *elt); + + +#endif diff --git a/include/haproxy/payload.h b/include/haproxy/payload.h new file mode 100644 index 0000000..f91817a --- /dev/null +++ b/include/haproxy/payload.h @@ -0,0 +1,39 @@ +/* + * include/haproxy/payload.h + * Definitions for payload-based sample fetches and ACLs + * + * Copyright (C) 2000-2013 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PAYLOAD_H +#define _HAPROXY_PAYLOAD_H + +#include <haproxy/api.h> +#include <haproxy/sample-t.h> +#include <haproxy/stream-t.h> + +int fetch_rdp_cookie_name(struct stream *s, struct sample *smp, const char *cname, int clen); +int val_payload_lv(struct arg *arg, char **err_msg); + +#endif /* _HAPROXY_PAYLOAD_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/peers-t.h b/include/haproxy/peers-t.h new file mode 100644 index 0000000..124fac3 --- /dev/null +++ b/include/haproxy/peers-t.h @@ -0,0 +1,160 @@ +/* + * include/haproxy/peers-t.h + * This file defines everything related to peers. 
+ * + * Copyright 2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PEERS_T_H +#define _HAPROXY_PEERS_T_H + +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#include <import/ebtree-t.h> + +#include <haproxy/api-t.h> +#include <haproxy/dict-t.h> +#include <haproxy/stick_table-t.h> +#include <haproxy/thread-t.h> + + +struct shared_table { + struct stktable *table; /* stick table to sync */ + int local_id; + int remote_id; + int flags; + uint64_t remote_data; + unsigned int remote_data_nbelem[STKTABLE_DATA_TYPES]; + unsigned int last_acked; + unsigned int last_pushed; + unsigned int last_get; + unsigned int teaching_origin; + unsigned int update; + struct shared_table *next; /* next shared table in list */ +}; + +struct peer { + int local; /* proxy state */ + __decl_thread(HA_SPINLOCK_T lock); /* lock used to handle this peer section */ + char *id; + struct { + const char *file; /* file where the section appears */ + int line; /* line where the section appears */ + } conf; /* config information */ + time_t last_change; + struct sockaddr_storage addr; /* peer address */ + struct protocol *proto; /* peer address protocol */ + struct xprt_ops *xprt; /* peer socket operations at transport layer */ + void *sock_init_arg; /* socket operations's opaque init argument if needed */ + unsigned int flags; /* peer session flags */ + unsigned int statuscode; /* current/last session status code */ + unsigned int reconnect; /* next connect timer */ + unsigned int heartbeat; /* next heartbeat timer */ + unsigned int confirm; /* confirm message counter */ + unsigned int last_hdshk; /* Date of the last handshake. 
*/ + uint32_t rx_hbt; /* received heartbeats counter */ + uint32_t tx_hbt; /* transmitted heartbeats counter */ + uint32_t no_hbt; /* no received heartbeat counter */ + uint32_t new_conn; /* new connection after reconnection timeout expiration counter */ + uint32_t proto_err; /* protocol errors counter */ + uint32_t coll; /* connection collisions counter */ + struct appctx *appctx; /* the appctx running it */ + struct shared_table *remote_table; + struct shared_table *last_local_table; /* Last table that emitted update messages during a teach process */ + struct shared_table *stop_local_table; /* last evaluated table, used as restart point for the next teach process */ + struct shared_table *tables; + struct server *srv; + struct dcache *dcache; /* dictionary cache */ + struct peers *peers; /* associated peer section */ + struct peer *next; /* next peer in the list */ +}; + + +struct peers { + char *id; /* peer section name */ + struct task *sync_task; /* main sync task */ + struct sig_handler *sighandler; /* signal handler */ + struct peer *remote; /* remote peers list */ + struct peer *local; /* local peer list */ + struct proxy *peers_fe; /* peer frontend */ + struct { + const char *file; /* file where the section appears */ + int line; /* line where the section appears */ + } conf; /* config information */ + time_t last_change; + struct peers *next; /* next peer section */ + unsigned int flags; /* current peers section resync state */ + unsigned int resync_timeout; /* resync timeout timer */ + int count; /* total of peers */ + int nb_shards; /* Number of peer shards */ + int disabled; /* peers proxy disabled if >0 */ + int applet_count[MAX_THREADS]; /* applet count per thread */ +}; + +/* LRU cache for dictionaries */ +struct dcache_tx { + /* The last recently used key */ + unsigned int lru_key; + /* An array of entries to store pointers to dictionary entries. */ + struct ebpt_node *entries; + /* The previous lookup result. */ + struct ebpt_node *prev_lookup; + /* ebtree to store the previous entries. */ + struct eb_root cached_entries; +}; + +struct dcache_rx { + unsigned int id; + struct dict_entry *de; +}; + +struct dcache_tx_entry { + unsigned int id; + struct ebpt_node entry; +}; + +/* stick-table data type cache */ +struct dcache { + /* Cache used upon transmission */ + struct dcache_tx *tx; + /* Cache used upon receipt */ + struct dcache_rx *rx; + /* Maximum number of entries in this cache */ + size_t max_entries; +}; + +struct peers_keyword { + const char *kw; + int (*parse)( + char **args, + struct peers *curpeer, + const char *file, + int line, + char **err); + int flags; +}; + +struct peers_kw_list { + struct list list; + struct peers_keyword kw[VAR_ARRAY]; +}; + +#endif /* _HAPROXY_PEERS_T_H */ + diff --git a/include/haproxy/peers.h b/include/haproxy/peers.h new file mode 100644 index 0000000..e3c5fd3 --- /dev/null +++ b/include/haproxy/peers.h @@ -0,0 +1,69 @@ +/* + * include/haproxy/peers.h + * This file defines function prototypes for peers management. + * + * Copyright 2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PEERS_H +#define _HAPROXY_PEERS_H + +#include <haproxy/api.h> +#include <haproxy/connection.h> +#include <haproxy/obj_type.h> +#include <haproxy/peers-t.h> +#include <haproxy/proxy-t.h> +#include <haproxy/stick_table-t.h> +#include <haproxy/stream-t.h> + + +extern struct peers_kw_list peers_keywords; +extern struct peers *cfg_peers; + +int peers_init_sync(struct peers *peers); +int peers_alloc_dcache(struct peers *peers); +int peers_register_table(struct peers *, struct stktable *table); +void peers_setup_frontend(struct proxy *fe); +void peers_register_keywords(struct peers_kw_list *pkwl); + +#if defined(USE_OPENSSL) +static inline enum obj_type *peer_session_target(struct peer *p, struct stream *s) +{ + if (p->srv->use_ssl) + return &p->srv->obj_type; + else + return &s->be->obj_type; +} + +static inline struct xprt_ops *peer_xprt(struct peer *p) +{ + return p->srv->use_ssl ? xprt_get(XPRT_SSL) : xprt_get(XPRT_RAW); +} +#else +static inline enum obj_type *peer_session_target(struct peer *p, struct stream *s) +{ + return &s->be->obj_type; +} + +static inline struct xprt_ops *peer_xprt(struct peer *p) +{ + return xprt_get(XPRT_RAW); +} +#endif + +#endif /* _HAPROXY_PEERS_H */ + diff --git a/include/haproxy/pipe-t.h b/include/haproxy/pipe-t.h new file mode 100644 index 0000000..1a1fcfd --- /dev/null +++ b/include/haproxy/pipe-t.h @@ -0,0 +1,43 @@ +/* + * include/haproxy/pipe-t.h + * Pipe management - types definitions. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PIPE_T_H +#define _HAPROXY_PIPE_T_H + +/* A pipe is described by its read and write FDs, and the data remaining in it. + * The FDs are valid if there are data pending. The user is not allowed to + * change the FDs. 
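+ * + * For example, a pipe currently holding 4kB of spliced data could look like + * this (FD numbers purely illustrative): + * + *     struct pipe p = { .data = 4096, .prod = 8, .cons = 7, .next = NULL };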
*/ +struct pipe { + int data; /* number of bytes present in the pipe */ + int prod; /* FD the producer must write to ; -1 if none */ + int cons; /* FD the consumer must read from ; -1 if none */ + struct pipe *next; +}; + +#endif /* _HAPROXY_PIPE_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/pipe.h b/include/haproxy/pipe.h new file mode 100644 index 0000000..12bd8ea --- /dev/null +++ b/include/haproxy/pipe.h @@ -0,0 +1,54 @@ +/* + * include/haproxy/pipe.h + * Pipe management - exported functions + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PIPE_H +#define _HAPROXY_PIPE_H + +#include <haproxy/api.h> +#include <haproxy/pipe-t.h> + +extern int pipes_used; /* # of pipes in use (2 fds each) */ +extern int pipes_free; /* # of pipes unused (2 fds each) */ + +/* return a pre-allocated empty pipe. Try to allocate one if there isn't any + * left. NULL is returned if a pipe could not be allocated. + */ +struct pipe *get_pipe(); + +/* destroy a pipe, possibly because an error was encountered on it. Its FDs + * will be closed and it will not be reinjected into the live pool. + */ +void kill_pipe(struct pipe *p); + +/* put back an unused pipe into the live pool. If it still has data in it, it is + * closed and not reinjected into the live pool. The caller is not allowed to + * use it once released. + */ +void put_pipe(struct pipe *p); + +#endif /* _HAPROXY_PIPE_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/pool-os.h b/include/haproxy/pool-os.h new file mode 100644 index 0000000..cf29c58 --- /dev/null +++ b/include/haproxy/pool-os.h @@ -0,0 +1,109 @@ +/* + * include/haproxy/pool-os.h + * OS-level interface for memory management + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_POOL_OS_H +#define _HAPROXY_POOL_OS_H + +#include <sys/mman.h> +#include <stdlib.h> +#include <haproxy/api.h> + + +/************* normal allocator *************/ + +/* allocates an area of size <size> and returns it.
The semantics are similar + * to those of malloc(). + */ +static forceinline void *pool_alloc_area(size_t size) +{ + return malloc(size); +} + +/* frees an area <area> of size <size> allocated by pool_alloc_area(). The + * semantics are identical to free() except that the size is specified and + * may be ignored. + */ +static forceinline void pool_free_area(void *area, size_t __maybe_unused size) +{ + will_free(area, size); + free(area); +} + +/************* use-after-free allocator *************/ + +/* allocates an area of size <size> and returns it. The semantics are similar + * to those of malloc(). However the allocation is rounded up to 4kB so that a + * full page is allocated. This ensures the object can be freed alone so that + * future dereferences are easily detected. The returned object is always + * 16-byte aligned to avoid issues with unaligned structure objects. In case + * some padding is added, the area's start address is copied at the end of the + * padding to help detect underflows. + */ +static inline void *pool_alloc_area_uaf(size_t size) +{ + size_t pad = (4096 - size) & 0xFF0; + void *ret; + + ret = mmap(NULL, (size + 4095) & -4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (ret != MAP_FAILED) { + /* let's dereference the page before returning so that the real + * allocation in the system is performed without holding the lock. + */ + *(int *)ret = 0; + if (pad >= sizeof(void *)) + *(void **)(ret + pad - sizeof(void *)) = ret + pad; + ret += pad; + } else { + ret = NULL; + } + return ret; +} + +/* frees an area <area> of size <size> allocated by pool_alloc_area_uaf(). The + * semantics are identical to free() except that the size must absolutely match + * the one passed to pool_alloc_area_uaf(). In case some padding is added, the + * area's start address is compared to the one at the end of the padding, and + * a segfault is triggered if they don't match, indicating an underflow. + */ +static inline void pool_free_area_uaf(void *area, size_t size) +{ + size_t pad = (4096 - size) & 0xFF0; + + /* This object will be released for real in order to detect a use after + * free. We also force a write to the area to ensure we crash on double + * free or free of a const area. + */ + *(uint32_t *)area = 0xDEADADD4; + + if (pad >= sizeof(void *) && *(void **)(area - sizeof(void *)) != area) + ABORT_NOW(); + + munmap(area - pad, (size + 4095) & -4096); +} + +#endif /* _HAPROXY_POOL_OS_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/pool-t.h b/include/haproxy/pool-t.h new file mode 100644 index 0000000..157e2ca --- /dev/null +++ b/include/haproxy/pool-t.h @@ -0,0 +1,149 @@ +/* + * include/haproxy/pool-t.h + * Memory pools configuration and type definitions. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_POOL_T_H +#define _HAPROXY_POOL_T_H + +#include <haproxy/api-t.h> +#include <haproxy/list-t.h> + +#define MEM_F_SHARED 0x1 +#define MEM_F_EXACT 0x2 + +/* A special pointer for the pool's free_list that indicates someone is + * currently manipulating it. Serves as a short-lived lock. + */ +#define POOL_BUSY ((void *)1) + +#define POOL_AVG_SAMPLES 1024 + +/* possible flags for __pool_alloc() */ +#define POOL_F_NO_POISON 0x00000001 // do not poison the area +#define POOL_F_MUST_ZERO 0x00000002 // zero the returned area +#define POOL_F_NO_FAIL 0x00000004 // do not randomly fail + +/* pool debugging flags */ +#define POOL_DBG_FAIL_ALLOC 0x00000001 // randomly fail memory allocations +#define POOL_DBG_DONT_MERGE 0x00000002 // do not merge same-size pools +#define POOL_DBG_COLD_FIRST 0x00000004 // pick cold objects first +#define POOL_DBG_INTEGRITY 0x00000008 // perform integrity checks on cache +#define POOL_DBG_NO_GLOBAL 0x00000010 // disable global pools +#define POOL_DBG_NO_CACHE 0x00000020 // disable thread-local pool caches +#define POOL_DBG_CALLER 0x00000040 // trace last caller's location +#define POOL_DBG_TAG 0x00000080 // place a tag at the end of the area +#define POOL_DBG_POISON 0x00000100 // poison memory area on pool_alloc() +#define POOL_DBG_UAF 0x00000200 // enable use-after-free protection + + +/* This is the head of a thread-local cache */ +struct pool_cache_head { + struct list list; /* head of objects in this pool */ + unsigned int count; /* number of objects in this pool */ + unsigned int tid; /* thread id, for debugging only */ + struct pool_head *pool; /* assigned pool, for debugging only */ + ulong fill_pattern; /* pattern used to fill the area on free */ +} THREAD_ALIGNED(64); + +/* This represents one item stored in the thread-local cache. <by_pool> links + * the object to the list of objects in the pool, and <by_lru> links the object + * to the local thread's list of hottest objects. This way it's possible to + * allocate a fresh object from the cache, or to release cold objects from any + * pool (no bookkeeping is needed since shared pools do not know how many + * objects they store). + */ +struct pool_cache_item { + struct list by_pool; /* link to objects in this pool */ + struct list by_lru; /* link to objects by LRU order */ +}; + +/* This structure is used to represent an element in the pool's shared + * free_list. An item may carry a series of other items allocated or released + * as a same cluster. The storage then looks like this: + * +------+ +------+ +------+ + * -->| next |-->| next |-->| NULL | + * +------+ +------+ +------+ + * | NULL | | down | | down | + * +------+ +--|---+ +--|---+ + * V V + * +------+ +------+ + * | NULL | | NULL | + * +------+ +------+ + * | down | | NULL | + * +--|---+ +------+ + * V + * +------+ + * | NULL | + * +------+ + * | NULL | + * +------+ + */ +struct pool_item { + struct pool_item *next; + struct pool_item *down; // link to other items of the same cluster +}; + +/* This describes a complete pool, with its status, usage statistics and the + * thread-local caches if any. Even if pools are disabled, these descriptors + * are valid and are used at least to get names and sizes. 
For small builds + * using neither threads nor pools, this structure might be reduced, and + * alignment could be removed. + */ +struct pool_head { + /* read-mostly part, purely configuration */ + unsigned int limit; /* hard limit on the number of chunks */ + unsigned int minavail; /* how many chunks are expected to be used */ + unsigned int size; /* chunk size */ + unsigned int flags; /* MEM_F_* */ + unsigned int users; /* number of pools sharing this zone */ + unsigned int alloc_sz; /* allocated size (includes hidden fields) */ + struct list list; /* list of all known pools */ + void *base_addr; /* allocation address, for free() */ + char name[12]; /* name of the pool */ + + /* heavily read-write part */ + THREAD_ALIGN(64); + + /* these entries depend on the pointer value, they're used to reduce + * the contention on fast-changing values. The alignment here is + * important since the purpose is to lower the thread contention. + * The free_list and used/allocated are not related, the array is + * just meant to shard elements and there are no per-free_list stats. + */ + struct { + THREAD_ALIGN(64); + struct pool_item *free_list; /* list of free shared objects */ + unsigned int allocated; /* how many chunks have been allocated */ + unsigned int used; /* how many chunks are currently in use */ + unsigned int needed_avg;/* floating indicator between used and allocated */ + unsigned int failed; /* failed allocations (indexed by hash of TID) */ + } buckets[CONFIG_HAP_POOL_BUCKETS]; + + struct pool_cache_head cache[MAX_THREADS] THREAD_ALIGNED(64); /* pool caches */ +} __attribute__((aligned(64))); + +#endif /* _HAPROXY_POOL_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/pool.h b/include/haproxy/pool.h new file mode 100644 index 0000000..bf7cb8d --- /dev/null +++ b/include/haproxy/pool.h @@ -0,0 +1,368 @@ +/* + * include/haproxy/pool.h + * Memory management definitions.. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_POOL_H +#define _HAPROXY_POOL_H + +#include <string.h> + +#include <haproxy/api.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/list.h> +#include <haproxy/pool-t.h> +#include <haproxy/thread.h> + +/* This registers a call to create_pool_callback(ptr, name, size) */ +#define REGISTER_POOL(ptr, name, size) \ + INITCALL3(STG_POOL, create_pool_callback, (ptr), (name), (size)) + +/* This macro declares a pool head <ptr> and registers its creation */ +#define DECLARE_POOL(ptr, name, size) \ + struct pool_head *(ptr) __read_mostly = NULL; \ + REGISTER_POOL(&ptr, name, size) + +/* This macro declares a static pool head <ptr> and registers its creation */ +#define DECLARE_STATIC_POOL(ptr, name, size) \ + static struct pool_head *(ptr) __read_mostly; \ + REGISTER_POOL(&ptr, name, size) + +/* By default, free objects are linked by a pointer stored at the beginning of + * the memory area. When DEBUG_MEMORY_POOLS is set, the allocated area is + * inflated by the size of a pointer so that the link is placed at the end + * of the objects. Hence free objects in pools remain intact. In addition, + * this location is used to keep a pointer to the pool the object was + * allocated from, and verify it's freed into the appropriate one. + */ +# define POOL_EXTRA_MARK (sizeof(void *)) +# define POOL_DEBUG_SET_MARK(pool, item) \ + do { \ + typeof(pool) __p = (pool); \ + typeof(item) __i = (item); \ + if (likely(!(pool_debugging & POOL_DBG_TAG))) \ + break; \ + *(typeof(pool)*)(((char *)__i) + __p->size) = __p; \ + } while (0) + +# define POOL_DEBUG_RESET_MARK(pool, item) \ + do { \ + typeof(pool) __p = (pool); \ + typeof(item) __i = (item); \ + if (likely(!(pool_debugging & POOL_DBG_TAG))) \ + break; \ + *(typeof(pool)*)(((char *)__i) + __p->size) = __builtin_return_address(0); \ + } while (0) + +# define POOL_DEBUG_CHECK_MARK(pool, item, caller) \ + do { \ + typeof(pool) __p = (pool); \ + typeof(item) __i = (item); \ + if (likely(!(pool_debugging & POOL_DBG_TAG))) \ + break; \ + if (*(typeof(pool)*)(((char *)__i) + __p->size) != __p) { \ + pool_inspect_item("tag mismatch on free()", pool, item, caller); \ + ABORT_NOW(); \ + } \ + } while (0) + +/* It's possible to trace callers of pool_free() by placing their pointer + * after the end of the area and the optional mark above, which means the + * end of the allocated array. 
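+ * + * With both the tag and the caller options enabled, the layout of an + * allocated object is thus, as a sketch: + * + *     [ object: size bytes ][ pool tag ][ caller pointer ]   = alloc_sz bytes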
+ */ +# define POOL_EXTRA_CALLER (sizeof(void *)) +# define POOL_DEBUG_TRACE_CALLER(pool, item, caller) \ + do { \ + typeof(pool) __p = (pool); \ + typeof(item) __i = (item); \ + typeof(caller) __c = (caller); \ + if (likely(!(pool_debugging & POOL_DBG_CALLER))) \ + break; \ + *(typeof(caller)*)(((char *)__i) + __p->alloc_sz - sizeof(void*)) = __c; \ + } while (0) + +/* poison each newly allocated area with this byte if >= 0 */ +extern int mem_poison_byte; + +/* trim() in progress */ +extern int pool_trim_in_progress; + +/* set of POOL_DBG_* flags */ +extern uint pool_debugging; + +int malloc_trim(size_t pad); +void trim_all_pools(void); + +void *pool_get_from_os_noinc(struct pool_head *pool); +void pool_put_to_os_nodec(struct pool_head *pool, void *ptr); +void *pool_alloc_nocache(struct pool_head *pool, const void *caller); +void pool_free_nocache(struct pool_head *pool, void *ptr); +void dump_pools(void); +int pool_parse_debugging(const char *str, char **err); +int pool_total_failures(void); +unsigned long long pool_total_allocated(void); +unsigned long long pool_total_used(void); +void pool_flush(struct pool_head *pool); +void pool_gc(struct pool_head *pool_ctx); +struct pool_head *create_pool(char *name, unsigned int size, unsigned int flags); +void create_pool_callback(struct pool_head **ptr, char *name, unsigned int size); +void *pool_destroy(struct pool_head *pool); +void pool_destroy_all(void); +void *__pool_alloc(struct pool_head *pool, unsigned int flags); +void __pool_free(struct pool_head *pool, void *ptr); +void pool_inspect_item(const char *msg, struct pool_head *pool, const void *item, const void *caller); + + +/****************** Thread-local cache management ******************/ + +extern THREAD_LOCAL size_t pool_cache_bytes; /* total cache size */ +extern THREAD_LOCAL size_t pool_cache_count; /* #cache objects */ + +void pool_evict_from_local_cache(struct pool_head *pool, int full); +void pool_evict_from_local_caches(void); +void pool_put_to_cache(struct pool_head *pool, void *ptr, const void *caller); +void pool_fill_pattern(struct pool_cache_head *pch, struct pool_cache_item *item, uint size); +void pool_check_pattern(struct pool_cache_head *pch, struct pool_head *pool, struct pool_cache_item *item, const void *caller); +void pool_refill_local_from_shared(struct pool_head *pool, struct pool_cache_head *pch); +void pool_put_to_shared_cache(struct pool_head *pool, struct pool_item *item); + +/* returns the total number of allocated entries for a pool across all buckets */ +static inline uint pool_allocated(const struct pool_head *pool) +{ + int bucket; + uint ret; + + for (bucket = ret = 0; bucket < CONFIG_HAP_POOL_BUCKETS; bucket++) + ret += HA_ATOMIC_LOAD(&pool->buckets[bucket].allocated); + return ret; +} + +/* returns the total number of used entries for a pool across all buckets */ +static inline uint pool_used(const struct pool_head *pool) +{ + int bucket; + uint ret; + + for (bucket = ret = 0; bucket < CONFIG_HAP_POOL_BUCKETS; bucket++) + ret += HA_ATOMIC_LOAD(&pool->buckets[bucket].used); + return ret; +} + +/* returns the raw total number of needed entries across all buckets. It must + * be passed to swrate_avg() to get something usable.
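+ * For example (sketch): + *     avg = swrate_avg(pool_needed_avg(pool), POOL_AVG_SAMPLES);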
+ */ +static inline uint pool_needed_avg(const struct pool_head *pool) +{ + int bucket; + uint ret; + + for (bucket = ret = 0; bucket < CONFIG_HAP_POOL_BUCKETS; bucket++) + ret += HA_ATOMIC_LOAD(&pool->buckets[bucket].needed_avg); + return ret; +} + +/* returns the total number of failed allocations for a pool across all buckets */ +static inline uint pool_failed(const struct pool_head *pool) +{ + int bucket; + uint ret; + + for (bucket = ret = 0; bucket < CONFIG_HAP_POOL_BUCKETS; bucket++) + ret += HA_ATOMIC_LOAD(&pool->buckets[bucket].failed); + return ret; +} + +/* Returns the max number of entries that may be brought back to the pool + * before it's considered as full. Note that it is only usable for releasing + * objects, hence the function assumes that no more than ->used entries will + * be released in the worst case, and that this value is always lower than or + * equal to ->allocated. It's important to understand that under thread + * contention these values may not always be accurate but the principle is that + * any deviation remains contained. When global pools are disabled, this + * function always returns zero so that the caller knows it must free the + * object via other ways. + */ +static inline uint pool_releasable(const struct pool_head *pool) +{ + uint alloc, used; + uint needed_raw; + + if (unlikely(pool_debugging & (POOL_DBG_NO_CACHE|POOL_DBG_NO_GLOBAL))) + return 0; + + alloc = pool_allocated(pool); + used = pool_used(pool); + if (used > alloc) + alloc = used; + + needed_raw = pool_needed_avg(pool); + if (alloc < swrate_avg(needed_raw + needed_raw / 4, POOL_AVG_SAMPLES)) + return used; // less than needed is allocated, can release everything + + if ((uint)(alloc - used) < pool->minavail) + return pool->minavail - (alloc - used); // less than minimum available + + /* there are enough objects in this pool */ + return 0; +} + +/* These are generic cache-aware wrappers that allocate/free from/to the local + * cache first, then from the second level if it exists. + */ + +/* Tries to retrieve an object from the local pool cache corresponding to pool + * <pool>. If none is available, tries to allocate from the shared cache if any + * and returns NULL if nothing is available. Must not be used when pools are + * disabled. + */ +static inline void *pool_get_from_cache(struct pool_head *pool, const void *caller) +{ + struct pool_cache_item *item; + struct pool_cache_head *ph; + + BUG_ON(pool_debugging & POOL_DBG_NO_CACHE); + + ph = &pool->cache[tid]; + if (unlikely(LIST_ISEMPTY(&ph->list))) { + if (!(pool_debugging & POOL_DBG_NO_GLOBAL)) + pool_refill_local_from_shared(pool, ph); + if (LIST_ISEMPTY(&ph->list)) + return NULL; + } + + /* allocate hottest objects first */ + item = LIST_NEXT(&ph->list, typeof(item), by_pool); + + if (unlikely(pool_debugging & (POOL_DBG_COLD_FIRST|POOL_DBG_INTEGRITY))) { + /* allocate oldest objects first so as to keep them as long as possible + * in the cache before being reused and maximizing the chance to detect + * an overwrite. 
*/ + if (pool_debugging & POOL_DBG_COLD_FIRST) + item = LIST_PREV(&ph->list, typeof(item), by_pool); + + if (pool_debugging & POOL_DBG_INTEGRITY) + pool_check_pattern(ph, pool, item, caller); + } + + BUG_ON(&item->by_pool == &ph->list); + LIST_DELETE(&item->by_pool); + LIST_DELETE(&item->by_lru); + + /* keep track of where the element was allocated from */ + POOL_DEBUG_SET_MARK(pool, item); + POOL_DEBUG_TRACE_CALLER(pool, item, caller); + + ph->count--; + pool_cache_bytes -= pool->size; + pool_cache_count--; + + return item; +} + + +/****************** Common high-level code ******************/ + +#if !defined(DEBUG_MEM_STATS) + +/* + * Returns a pointer to an object from pool <pool> allocated using + * flags <flag> from the POOL_F_* set. + */ +#define pool_alloc_flag(pool, flag) __pool_alloc((pool), (flag)) + +/* + * Returns a pointer to an object taken from pool <pool> or + * dynamically allocated. Memory poisoning is performed if enabled. + */ +#define pool_alloc(pool) __pool_alloc((pool), 0) + +/* + * Returns a pointer to an object taken from pool <pool> or + * dynamically allocated. The area is zeroed. + */ +#define pool_zalloc(pool) __pool_alloc((pool), POOL_F_MUST_ZERO) + +/* + * Puts a memory area back to the corresponding pool. Just like with the libc's + * free(), <ptr> may be NULL. + */ +#define pool_free(pool, ptr) \ + do { \ + typeof(ptr) __ptr = (ptr); \ + if (likely((__ptr) != NULL)) \ + __pool_free(pool, __ptr); \ + } while (0) + + +#else /* DEBUG_MEM_STATS is set below */ + +#define pool_free(pool, ptr) ({ \ + struct pool_head *__pool = (pool); \ + typeof(ptr) __ptr = (ptr); \ + static struct mem_stats _ __attribute__((used,__section__("mem_stats"),__aligned__(sizeof(void*)))) = { \ + .caller = { \ + .file = __FILE__, .line = __LINE__, \ + .what = MEM_STATS_TYPE_P_FREE, \ + .func = __func__, \ + }, \ + }; \ + _.extra = __pool; \ + HA_WEAK(__start_mem_stats); \ + HA_WEAK(__stop_mem_stats); \ + if (__ptr) { \ + _HA_ATOMIC_INC(&_.calls); \ + _HA_ATOMIC_ADD(&_.size, __pool->size); \ + __pool_free(__pool, __ptr); \ + } \ +}) + +#define pool_alloc_flag(pool, flag) ({ \ + struct pool_head *__pool = (pool); \ + uint __flag = (flag); \ + size_t __x = __pool->size; \ + static struct mem_stats _ __attribute__((used,__section__("mem_stats"),__aligned__(sizeof(void*)))) = { \ + .caller = { \ + .file = __FILE__, .line = __LINE__, \ + .what = MEM_STATS_TYPE_P_ALLOC, \ + .func = __func__, \ + }, \ + }; \ + _.extra = __pool; \ + HA_WEAK(__start_mem_stats); \ + HA_WEAK(__stop_mem_stats); \ + _HA_ATOMIC_INC(&_.calls); \ + _HA_ATOMIC_ADD(&_.size, __x); \ + __pool_alloc(__pool, __flag); \ +}) + +#define pool_alloc(pool) pool_alloc_flag(pool, 0) + +#define pool_zalloc(pool) pool_alloc_flag(pool, POOL_F_MUST_ZERO) + +#endif /* DEBUG_MEM_STATS */ + +#endif /* _HAPROXY_POOL_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/port_range-t.h b/include/haproxy/port_range-t.h new file mode 100644 index 0000000..eea1132 --- /dev/null +++ b/include/haproxy/port_range-t.h @@ -0,0 +1,40 @@ +/* + * include/haproxy/port_range-t.h + * This file defines the port_range type + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_PORT_RANGE_T_H
+#define _HAPROXY_PORT_RANGE_T_H
+
+#include <netinet/in.h>
+#include <haproxy/api-t.h>
+
+struct port_range {
+	int size, get, put_h, put_t;	/* range size, and get/put positions */
+	uint16_t ports[VAR_ARRAY];	/* array of <size> ports, in host byte order */
+};
+
+#endif /* _HAPROXY_PORT_RANGE_T_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/port_range.h b/include/haproxy/port_range.h
new file mode 100644
index 0000000..9e4379a
--- /dev/null
+++ b/include/haproxy/port_range.h
@@ -0,0 +1,105 @@
+/*
+ * include/haproxy/port_range.h
+ * This file defines everything needed to manage port ranges
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_PORT_RANGE_H
+#define _HAPROXY_PORT_RANGE_H
+
+#include <stdlib.h>
+#include <haproxy/api.h>
+#include <haproxy/port_range-t.h>
+
+#define GET_NEXT_OFF(range, off) ((off) == (range)->size - 1 ? 0 : (off) + 1)
+
+/* return an available port from range <range>, or zero if none is left */
+static inline int port_range_alloc_port(struct port_range *range)
+{
+	int ret;
+	int get;
+	int put;
+
+	get = _HA_ATOMIC_LOAD(&range->get);
+	do {
+		/* barrier to make sure get is loaded before put */
+		__ha_barrier_atomic_load();
+		put = _HA_ATOMIC_LOAD(&range->put_t);
+		if (unlikely(put == get))
+			return 0;
+		ret = range->ports[get];
+	} while (!(_HA_ATOMIC_CAS(&range->get, &get, GET_NEXT_OFF(range, get))));
+	return ret;
+}
+
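+/* Example (illustrative): the allocation/release functions in this file are
+ * meant to be paired by their caller, e.g. when picking a source port for an
+ * outgoing connection; <range> and <port> below are the caller's own
+ * variables:
+ *
+ *	port = port_range_alloc_port(range);
+ *	if (!port)
+ *		return 0;	// range exhausted
+ *	// ... use <port> ...
+ *	port_range_release_port(range, port);
+ *	port = 0;	// mark it unused again, as documented below
+ */
+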
+/* release port <port> into port range <range>. Does nothing if <port> is zero
+ * or if <range> is NULL. The caller is responsible for marking the port
+ * unused by either setting the port to zero or the range to NULL.
+ */
+static inline void port_range_release_port(struct port_range *range, int port)
+{
+	int put;
+
+	if (!port || !range)
+		return;
+
+	put = range->put_h;
+	/* put_h is reserved for producers, so that they can each get a
+	 * free slot, put_t is what is used by consumers to know if there's
+	 * elements available or not
+	 */
+	/* First reserve our slot, we know the ring buffer can't be full,
+	 * as we will only ever release ports we allocated before
+	 */
+	while (!(_HA_ATOMIC_CAS(&range->put_h, &put, GET_NEXT_OFF(range, put))));
+	_HA_ATOMIC_STORE(&range->ports[put], port);
+	/* Barrier to make sure the new port is visible before we change put_t */
+	__ha_barrier_atomic_store();
+	/* Wait until all the threads that got a slot before us are done */
+	while ((volatile int)range->put_t != put)
+		__ha_compiler_barrier();
+	/* Let the world know we're done, and let any potential consumer know
+	 * they can use that port.
+	 */
+	_HA_ATOMIC_STORE(&range->put_t, GET_NEXT_OFF(range, put));
+}
+
+/* return a new initialized port range of N ports. The ports are not
+ * filled in, it's up to the caller to do it.
+ */
+static inline struct port_range *port_range_alloc_range(int n)
+{
+	struct port_range *ret;
+	ret = calloc(1, sizeof(struct port_range) +
+		     (n + 1) * sizeof(((struct port_range *)0)->ports[0]));
+	if (!ret)
+		return NULL;
+	ret->size = n + 1;
+	/* Start at the first free element */
+	ret->put_h = ret->put_t = n;
+	return ret;
+}
+
+#endif /* _HAPROXY_PORT_RANGE_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/proto_quic.h b/include/haproxy/proto_quic.h
new file mode 100644
index 0000000..a0e2b98
--- /dev/null
+++ b/include/haproxy/proto_quic.h
@@ -0,0 +1,35 @@
+/*
+ * AF_INET/AF_INET6 QUIC protocol layer definitions.
+ *
+ * Copyright 2020 Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_PROTO_QUIC_H
+#define _HAPROXY_PROTO_QUIC_H
+
+extern struct protocol proto_quic4;
+extern struct protocol proto_quic6;
+
+struct quic_cid_tree {
+	struct eb_root root;
+	__decl_thread(HA_RWLOCK_T lock);
+};
+
+extern struct quic_dghdlr *quic_dghdlrs;
+extern struct quic_cid_tree *quic_cid_trees;
+
+#endif /* _HAPROXY_PROTO_QUIC_H */
diff --git a/include/haproxy/proto_rhttp-t.h b/include/haproxy/proto_rhttp-t.h
new file mode 100644
index 0000000..28e2ff9
--- /dev/null
+++ b/include/haproxy/proto_rhttp-t.h
@@ -0,0 +1,14 @@
+#ifndef _HAPROXY_PROTO_RHTTP_H_T
+#define _HAPROXY_PROTO_RHTTP_H_T
+
+/* State for reverse preconnect listener state machine.
+ * Used to limit log reporting only on state changes.
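+ * (the preconnect task may run many times while staying in the same state;
+ * only transitions between the states below produce a new log line)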
+ */ +enum li_preconn_state { + LI_PRECONN_ST_STOP, /* pre-connect task inactive */ + LI_PRECONN_ST_INIT, /* pre-connect task bootstrapped */ + LI_PRECONN_ST_ERR, /* last pre-connect attempt failed */ + LI_PRECONN_ST_FULL, /* pre-connect maxconn reached */ +}; + +#endif /* _HAPROXY_PROTO_RHTTP_H_T */ diff --git a/include/haproxy/proto_rhttp.h b/include/haproxy/proto_rhttp.h new file mode 100644 index 0000000..421680f --- /dev/null +++ b/include/haproxy/proto_rhttp.h @@ -0,0 +1,21 @@ +#ifndef _HAPROXY_PROTO_RHTTP_H +#define _HAPROXY_PROTO_RHTTP_H + +#include <haproxy/connection-t.h> +#include <haproxy/listener-t.h> +#include <haproxy/receiver-t.h> + +int rhttp_bind_receiver(struct receiver *rx, char **errmsg); + +int rhttp_bind_listener(struct listener *listener, char *errmsg, int errlen); +void rhttp_enable_listener(struct listener *l); +void rhttp_disable_listener(struct listener *l); +struct connection *rhttp_accept_conn(struct listener *l, int *status); +void rhttp_unbind_receiver(struct listener *l); +int rhttp_set_affinity(struct connection *conn, int new_tid); + +int rhttp_accepting_conn(const struct receiver *rx); + +void rhttp_notify_preconn_err(struct listener *l); + +#endif /* _HAPROXY_PROTO_RHTTP_H */ diff --git a/include/haproxy/proto_sockpair.h b/include/haproxy/proto_sockpair.h new file mode 100644 index 0000000..bb0256e --- /dev/null +++ b/include/haproxy/proto_sockpair.h @@ -0,0 +1,32 @@ +/* + * Socket Pair protocol layer (sockpair) + * + * Copyright HAProxy Technologies - William Lallemand <wlallemand@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PROTO_SOCKPAIR_H +#define _HAPROXY_PROTO_SOCKPAIR_H + +extern struct proto_fam proto_fam_sockpair; +extern struct protocol proto_sockpair; + +int recv_fd_uxst(int sock); +int send_fd_uxst(int fd, int send_fd); +int sockpair_bind_receiver(struct receiver *rx, char **errmsg); + +#endif /* _HAPROXY_PROTO_SOCKPAIR_H */ + diff --git a/include/haproxy/proto_tcp.h b/include/haproxy/proto_tcp.h new file mode 100644 index 0000000..8a3d9fd --- /dev/null +++ b/include/haproxy/proto_tcp.h @@ -0,0 +1,45 @@ +/* + * include/haproxy/proto_tcp.h + * This file contains TCP socket protocol definitions. + * + * Copyright (C) 2000-2013 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PROTO_TCP_H +#define _HAPROXY_PROTO_TCP_H + +#include <haproxy/api.h> +#include <haproxy/arg-t.h> +#include <haproxy/connection-t.h> +#include <haproxy/listener-t.h> +#include <haproxy/sample-t.h> + +extern struct protocol proto_tcpv4; +extern struct protocol proto_tcpv6; + +int tcp_bind_socket(int fd, int flags, struct sockaddr_storage *local, struct sockaddr_storage *remote); +int tcp_connect_server(struct connection *conn, int flags); +int tcp_is_foreign(int fd, sa_family_t family); + +#endif /* _HAPROXY_PROTO_TCP_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/proto_udp.h b/include/haproxy/proto_udp.h new file mode 100644 index 0000000..1c4da77 --- /dev/null +++ b/include/haproxy/proto_udp.h @@ -0,0 +1,41 @@ +/* + * include/haproxy/proto_udp.h + * This file contains UDP socket protocol definitions. + * + * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * Partial merge by Emeric Brun <ebrun@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _PROTO_PROTO_UDP_H +#define _PROTO_PROTO_UDP_H + +extern struct protocol proto_udp4; +extern struct protocol proto_udp6; + +int udp_bind_socket(int fd, int flags, struct sockaddr_storage *local, struct sockaddr_storage *remote); +int udp_suspend_receiver(struct receiver *rx); +int udp_resume_receiver(struct receiver *rx); + +#endif /* _PROTO_PROTO_UDP_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/proto_uxst.h b/include/haproxy/proto_uxst.h new file mode 100644 index 0000000..77caf3d --- /dev/null +++ b/include/haproxy/proto_uxst.h @@ -0,0 +1,34 @@ +/* + * include/haproxy/proto_uxst.h + * This file contains UNIX stream socket protocol definitions. + * + * Copyright (C) 2000-2013 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _PROTO_PROTO_UXST_H +#define _PROTO_PROTO_UXST_H + +extern struct protocol proto_uxst; + +#endif /* _PROTO_PROTO_UXST_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/protobuf-t.h b/include/haproxy/protobuf-t.h new file mode 100644 index 0000000..b1a14e7 --- /dev/null +++ b/include/haproxy/protobuf-t.h @@ -0,0 +1,87 @@ +/* + * include/haproxy/protobuf-t.h + * This file contains structure declarations for protocol buffers. + * + * Copyright 2012 Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PROTOBUF_T_H +#define _HAPROXY_PROTOBUF_T_H + +#include <haproxy/api-t.h> + +enum protobuf_wire_type { + PBUF_TYPE_VARINT, + PBUF_TYPE_64BIT, + PBUF_TYPE_LENGTH_DELIMITED, + PBUF_TYPE_START_GROUP, /* Deprecated */ + PBUF_TYPE_STOP_GROUP, /* Deprecated */ + PBUF_TYPE_32BIT, +}; + +enum protobuf_type { + /* These enums are used to initialize calloc()'ed struct fields. + * Start them from 1 to avoid collisions with the default 0 value + * of such struct fields. + */ + PBUF_T_BINARY = 1, + + /* Do not reorder the following ones: + * PBUF_T_VARINT_*, PBUF_T_32BIT_* and PBUF_T_64BIT_* + */ + PBUF_T_VARINT_INT32, + PBUF_T_VARINT_UINT32, + PBUF_T_VARINT_INT64, + PBUF_T_VARINT_UINT64, + PBUF_T_VARINT_BOOL, + PBUF_T_VARINT_ENUM, + + /* These two following varints are first encoded with zigzag. */ + PBUF_T_VARINT_SINT32, + PBUF_T_VARINT_SINT64, + + /* Fixed size types from here. */ + PBUF_T_32BIT_FIXED32, + PBUF_T_32BIT_SFIXED32, + PBUF_T_32BIT_FLOAT, + + PBUF_T_64BIT_FIXED64, + PBUF_T_64BIT_SFIXED64, + PBUF_T_64BIT_DOUBLE, +}; + + +struct pbuf_fid { + unsigned int *ids; + size_t sz; +}; + +struct sample; +struct protobuf_parser_def { + int (*skip)(unsigned char **pos, size_t *left, size_t vlen); + int (*smp_store)(struct sample *, int type, + unsigned char *pos, size_t left, size_t vlen); +}; + +#endif /* _HAPROXY_PROTOBUF_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/protobuf.h b/include/haproxy/protobuf.h new file mode 100644 index 0000000..009bd13 --- /dev/null +++ b/include/haproxy/protobuf.h @@ -0,0 +1,577 @@ +/* + * include/haproxy/protobuf.h + * This file contains functions and macros declarations for protocol buffers decoding. + * + * Copyright 2012 Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PROTOBUF_H +#define _HAPROXY_PROTOBUF_H + +#include <haproxy/api-t.h> +#include <haproxy/arg-t.h> +#include <haproxy/protobuf-t.h> +#include <haproxy/sample-t.h> + +#define PBUF_VARINT_DONT_STOP_BIT 7 +#define PBUF_VARINT_DONT_STOP_BITMASK (1 << PBUF_VARINT_DONT_STOP_BIT) +#define PBUF_VARINT_DATA_BITMASK ~PBUF_VARINT_DONT_STOP_BITMASK + +/* .skip and .smp_store prototypes. */ +int protobuf_skip_varint(unsigned char **pos, size_t *len, size_t vlen); +int protobuf_smp_store_varint(struct sample *smp, int type, + unsigned char *pos, size_t len, size_t vlen); +int protobuf_skip_64bit(unsigned char **pos, size_t *len, size_t vlen); +int protobuf_smp_store_64bit(struct sample *smp, int type, + unsigned char *pos, size_t len, size_t vlen); +int protobuf_skip_vlen(unsigned char **pos, size_t *len, size_t vlen); +int protobuf_smp_store_vlen(struct sample *smp, int type, + unsigned char *pos, size_t len, size_t vlen); +int protobuf_skip_32bit(unsigned char **pos, size_t *len, size_t vlen); +int protobuf_smp_store_32bit(struct sample *smp, int type, + unsigned char *pos, size_t len, size_t vlen); + +struct protobuf_parser_def protobuf_parser_defs [] = { + [PBUF_TYPE_VARINT ] = { + .skip = protobuf_skip_varint, + .smp_store = protobuf_smp_store_varint, + }, + [PBUF_TYPE_64BIT ] = { + .skip = protobuf_skip_64bit, + .smp_store = protobuf_smp_store_64bit, + }, + [PBUF_TYPE_LENGTH_DELIMITED] = { + .skip = protobuf_skip_vlen, + .smp_store = protobuf_smp_store_vlen, + }, + [PBUF_TYPE_START_GROUP ] = { + /* XXX Deprecated XXX */ + }, + [PBUF_TYPE_STOP_GROUP ] = { + /* XXX Deprecated XXX */ + }, + [PBUF_TYPE_32BIT ] = { + .skip = protobuf_skip_32bit, + .smp_store = protobuf_smp_store_32bit, + }, +}; + +/* + * Note that the field values with protocol buffers 32bit and 64bit fixed size as type + * are sent in little-endian byte order to the network. + */ + +/* Convert a little-endian ordered 32bit integer to the byte order of the host. */ +static inline uint32_t pbuf_le32toh(uint32_t v) +{ + uint8_t *p = (uint8_t *)&v; + return (p[0] | (p[1] << 8) | (p[2] << 16) | (p[3] << 24)); +} + +/* Convert a little-endian ordered 64bit integer to the byte order of the host. */ +static inline uint64_t pbuf_le64toh(uint64_t v) +{ + return (uint64_t)(pbuf_le32toh(v >> 32)) << 32 | pbuf_le32toh(v); +} + +/* + * Return a protobuf type enum from <s> string if succeeded, -1 if not. + */ +int protobuf_type(const char *s) +{ + /* varint types. */ + if (strcmp(s, "int32") == 0) + return PBUF_T_VARINT_INT32; + else if (strcmp(s, "uint32") == 0) + return PBUF_T_VARINT_UINT32; + else if (strcmp(s, "sint32") == 0) + return PBUF_T_VARINT_SINT32; + else if (strcmp(s, "int64") == 0) + return PBUF_T_VARINT_INT64; + else if (strcmp(s, "uint64") == 0) + return PBUF_T_VARINT_UINT64; + else if (strcmp(s, "sint64") == 0) + return PBUF_T_VARINT_SINT64; + else if (strcmp(s, "bool") == 0) + return PBUF_T_VARINT_BOOL; + else if (strcmp(s, "enum") == 0) + return PBUF_T_VARINT_ENUM; + + /* 32bit fixed size types. 
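+	 * All three map to the PBUF_TYPE_32BIT wire type.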
+	 */
+	else if (strcmp(s, "fixed32") == 0)
+		return PBUF_T_32BIT_FIXED32;
+	else if (strcmp(s, "sfixed32") == 0)
+		return PBUF_T_32BIT_SFIXED32;
+	else if (strcmp(s, "float") == 0)
+		return PBUF_T_32BIT_FLOAT;
+
+	/* 64bit fixed size types. */
+	else if (strcmp(s, "fixed64") == 0)
+		return PBUF_T_64BIT_FIXED64;
+	else if (strcmp(s, "sfixed64") == 0)
+		return PBUF_T_64BIT_SFIXED64;
+	else if (strcmp(s, "double") == 0)
+		return PBUF_T_64BIT_DOUBLE;
+	else
+		return -1;
+}
+
+/*
+ * Decode a protocol buffers varint located in a buffer at <pos> address with
+ * <len> as length. The decoded value is stored at <val>.
+ * Returns 1 if succeeded, 0 if not.
+ */
+static inline int
+protobuf_varint(uint64_t *val, unsigned char *pos, size_t len)
+{
+	unsigned int shift;
+
+	*val = 0;
+	shift = 0;
+
+	while (len > 0) {
+		int stop = !(*pos & PBUF_VARINT_DONT_STOP_BITMASK);
+
+		*val |= ((uint64_t)(*pos & PBUF_VARINT_DATA_BITMASK)) << shift;
+
+		++pos;
+		--len;
+
+		if (stop)
+			break;
+		else if (!len)
+			return 0;
+
+		shift += 7;
+		/* The maximum length in bytes of a 64-bit encoded value is 10. */
+		if (shift > 63)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Decode a protocol buffers varint located in a buffer at <pos> offset address
+ * with <len> as length address. Update <*pos> and <*len> accordingly: <*len>
+ * is decreased by the number of decoded bytes. The decoded value is stored at
+ * <val>. Returns 1 if succeeded, 0 if not.
+ */
+static inline int
+protobuf_decode_varint(uint64_t *val, unsigned char **pos, size_t *len)
+{
+	unsigned int shift;
+
+	*val = 0;
+	shift = 0;
+
+	while (*len > 0) {
+		int stop = !(**pos & PBUF_VARINT_DONT_STOP_BITMASK);
+
+		*val |= ((uint64_t)**pos & PBUF_VARINT_DATA_BITMASK) << shift;
+
+		++*pos;
+		--*len;
+
+		if (stop)
+			break;
+		else if (!*len)
+			return 0;
+
+		shift += 7;
+		/* The maximum length in bytes of a 64-bit encoded value is 10. */
+		if (shift > 63)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Skip a protocol buffers varint found at <pos> as position address with <len>
+ * as available length address. Update <*pos> to make it point to the next
+ * available byte. Decrease <*len> by the number of skipped bytes.
+ * Returns 1 if succeeded, 0 if not.
+ */
+int
+protobuf_skip_varint(unsigned char **pos, size_t *len, size_t vlen)
+{
+	unsigned int shift;
+
+	shift = 0;
+
+	while (*len > 0) {
+		int stop = !(**pos & PBUF_VARINT_DONT_STOP_BITMASK);
+
+		++*pos;
+		--*len;
+
+		if (stop)
+			break;
+		else if (!*len)
+			return 0;
+
+		shift += 7;
+		/* The maximum length in bytes of a 64-bit encoded value is 10. */
+		if (shift > 63)
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * If succeeded, return the length of a protocol buffers varint found at <pos>
+ * as position address, with <len> as the number of available bytes at <pos>.
+ * Neither <pos> nor <len> is modified by this function.
+ * Return -1 if failed.
+ */
+static inline int
+protobuf_varint_getlen(unsigned char *pos, size_t len)
+{
+	unsigned char *spos;
+	unsigned int shift;
+
+	shift = 0;
+	spos = pos;
+
+	while (len > 0) {
+		int stop = !(*pos & PBUF_VARINT_DONT_STOP_BITMASK);
+
+		++pos;
+		--len;
+
+		if (stop)
+			break;
+		else if (!len)
+			return -1;
+
+		shift += 7;
+		/* The maximum length in bytes of a 64-bit encoded value is 10.
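+		 * (i.e. ceil(64/7): each byte carries 7 payload bits plus a
+		 * continuation bit; e.g. 300 is encoded as 0xac 0x02.)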
+		 */
+		if (shift > 63)
+			return -1;
+	}
+
+	return pos - spos;
+}
+
+/*
+ * Store a varint field value in a sample from <pos> buffer with <len>
+ * available bytes, after having decoded it if needed according to <type>,
+ * the expected protocol buffer type of the field.
+ * Return 1 if succeeded, 0 if not.
+ */
+int protobuf_smp_store_varint(struct sample *smp, int type,
+                              unsigned char *pos, size_t len, size_t vlen)
+{
+	switch (type) {
+	case PBUF_T_BINARY:
+	{
+		int varint_len;
+
+		varint_len = protobuf_varint_getlen(pos, len);
+		if (varint_len == -1)
+			return 0;
+
+		smp->data.type = SMP_T_BIN;
+		smp->data.u.str.area = (char *)pos;
+		smp->data.u.str.data = varint_len;
+		smp->flags = SMP_F_VOL_TEST;
+		break;
+	}
+
+	case PBUF_T_VARINT_INT32 ... PBUF_T_VARINT_ENUM:
+	{
+		uint64_t varint;
+
+		if (!protobuf_varint(&varint, pos, len))
+			return 0;
+
+		smp->data.u.sint = varint;
+		smp->data.type = SMP_T_SINT;
+		break;
+	}
+
+	case PBUF_T_VARINT_SINT32 ... PBUF_T_VARINT_SINT64:
+	{
+		uint64_t varint;
+
+		if (!protobuf_varint(&varint, pos, len))
+			return 0;
+
+		/* zigzag decoding. */
+		smp->data.u.sint = (varint >> 1) ^ -(varint & 1);
+		smp->data.type = SMP_T_SINT;
+		break;
+	}
+
+	default:
+		return 0;
+
+	}
+
+	return 1;
+}
+
+/*
+ * Move forward <*pos> buffer by 8 bytes. Used to skip a 64bit field.
+ */
+int protobuf_skip_64bit(unsigned char **pos, size_t *len, size_t vlen)
+{
+	if (*len < sizeof(uint64_t))
+		return 0;
+
+	*pos += sizeof(uint64_t);
+	*len -= sizeof(uint64_t);
+
+	return 1;
+}
+
+/*
+ * Store a fixed size 64bit field value in a sample from <pos> buffer with
+ * <len> available bytes, after having decoded it according to <type>, the
+ * expected protocol buffer type of the field.
+ * Return 1 if succeeded, 0 if not.
+ */
+int protobuf_smp_store_64bit(struct sample *smp, int type,
+                             unsigned char *pos, size_t len, size_t vlen)
+{
+	if (len < sizeof(uint64_t))
+		return 0;
+
+	switch (type) {
+	case PBUF_T_BINARY:
+		smp->data.type = SMP_T_BIN;
+		smp->data.u.str.area = (char *)pos;
+		smp->data.u.str.data = sizeof(uint64_t);
+		smp->flags = SMP_F_VOL_TEST;
+		break;
+
+	case PBUF_T_64BIT_FIXED64:
+	case PBUF_T_64BIT_SFIXED64:
+		smp->data.type = SMP_T_SINT;
+		smp->data.u.sint = pbuf_le64toh(*(uint64_t *)pos);
+		smp->flags = SMP_F_VOL_TEST;
+		break;
+
+	case PBUF_T_64BIT_DOUBLE:
+		smp->data.type = SMP_T_SINT;
+		smp->data.u.sint = pbuf_le64toh(*(double *)pos);
+		smp->flags = SMP_F_VOL_TEST;
+		break;
+
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Move forward <*pos> buffer by <vlen> bytes. Used to skip a length-delimited
+ * field.
+ */
+int protobuf_skip_vlen(unsigned char **pos, size_t *len, size_t vlen)
+{
+	if (*len < vlen)
+		return 0;
+
+	*pos += vlen;
+	*len -= vlen;
+
+	return 1;
+}
+
+/*
+ * Store a <vlen>-bytes length-delimited field value in a sample from <pos>
+ * buffer with <len> available bytes.
+ * Return 1 if succeeded, 0 if not.
+ */
+int protobuf_smp_store_vlen(struct sample *smp, int type,
+                            unsigned char *pos, size_t len, size_t vlen)
+{
+	if (len < vlen)
+		return 0;
+
+	if (type != PBUF_T_BINARY)
+		return 0;
+
+	smp->data.type = SMP_T_BIN;
+	smp->data.u.str.area = (char *)pos;
+	smp->data.u.str.data = vlen;
+	smp->flags = SMP_F_VOL_TEST;
+
+	return 1;
+}
+
+/*
+ * Move forward <*pos> buffer by 4 bytes. Used to skip a 32bit field.
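+ * <vlen> is ignored here: it only carries a meaningful value for
+ * length-delimited fields.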
+ */
+int protobuf_skip_32bit(unsigned char **pos, size_t *len, size_t vlen)
+{
+	if (*len < sizeof(uint32_t))
+		return 0;
+
+	*pos += sizeof(uint32_t);
+	*len -= sizeof(uint32_t);
+
+	return 1;
+}
+
+/*
+ * Store a fixed size 32bit field value in a sample from <pos> buffer with
+ * <len> available bytes, after having decoded it according to <type>, the
+ * expected protocol buffer type of the field.
+ * Return 1 if succeeded, 0 if not.
+ */
+int protobuf_smp_store_32bit(struct sample *smp, int type,
+                             unsigned char *pos, size_t len, size_t vlen)
+{
+	if (len < sizeof(uint32_t))
+		return 0;
+
+	switch (type) {
+	case PBUF_T_BINARY:
+		smp->data.type = SMP_T_BIN;
+		smp->data.u.str.area = (char *)pos;
+		smp->data.u.str.data = sizeof(uint32_t);
+		smp->flags = SMP_F_VOL_TEST;
+		break;
+
+	case PBUF_T_32BIT_FIXED32:
+		smp->data.type = SMP_T_SINT;
+		smp->data.u.sint = pbuf_le32toh(*(uint32_t *)pos);
+		smp->flags = SMP_F_VOL_TEST;
+		break;
+
+	case PBUF_T_32BIT_SFIXED32:
+		smp->data.type = SMP_T_SINT;
+		smp->data.u.sint = (int32_t)pbuf_le32toh(*(uint32_t *)pos);
+		smp->flags = SMP_F_VOL_TEST;
+		break;
+
+	case PBUF_T_32BIT_FLOAT:
+		smp->data.type = SMP_T_SINT;
+		smp->data.u.sint = pbuf_le32toh(*(float *)pos);
+		smp->flags = SMP_F_VOL_TEST;
+		break;
+
+	default:
+		return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Look up a protocol buffers field whose path of field numbers is provided
+ * by the first argument of <arg_p>, in the buffer at address <*pos> with
+ * <*len> available bytes. If found, store its value according to the storage
+ * type provided by the second argument of <arg_p> and return 1. Return 0 if
+ * not found or on decoding error.
+ */
+static inline int protobuf_field_lookup(const struct arg *arg_p, struct sample *smp,
+                                        unsigned char **pos, size_t *len)
+{
+	unsigned int *fid;
+	size_t fid_sz;
+	int type;
+	uint64_t elen;
+	int field;
+
+	fid = arg_p[0].data.fid.ids;
+	fid_sz = arg_p[0].data.fid.sz;
+	type = arg_p[1].data.sint;
+
+	/* Length of the length-delimited messages if any. */
+	elen = 0;
+	field = 0;
+
+	while (field < fid_sz) {
+		int found;
+		uint64_t key, sleft;
+		struct protobuf_parser_def *pbuf_parser = NULL;
+		unsigned int wire_type, field_number;
+
+		if ((ssize_t)*len <= 0)
+			return 0;
+
+		/* Remaining bytes saving. */
+		sleft = *len;
+
+		/* Key decoding */
+		if (!protobuf_decode_varint(&key, pos, len))
+			return 0;
+
+		wire_type = key & 0x7;
+		field_number = key >> 3;
+		found = field_number == fid[field];
+
+		/* Skip the data if the current field does not match. */
+		switch (wire_type) {
+		case PBUF_TYPE_VARINT:
+		case PBUF_TYPE_32BIT:
+		case PBUF_TYPE_64BIT:
+			pbuf_parser = &protobuf_parser_defs[wire_type];
+			if (!found && !pbuf_parser->skip(pos, len, 0))
+				return 0;
+			break;
+
+		case PBUF_TYPE_LENGTH_DELIMITED:
+			/* Decode the length of this length-delimited field. */
+			if (!protobuf_decode_varint(&elen, pos, len) || elen > *len)
+				return 0;
+
+			/* The size of the current field is computed from here so as
+			 * to skip the bytes used to encode the previous length.
+			 */
+			sleft = *len;
+			pbuf_parser = &protobuf_parser_defs[wire_type];
+			if (!found && !pbuf_parser->skip(pos, len, elen))
+				return 0;
+			break;
+
+		default:
+			return 0;
+		}
+
+		/* Store the data if found.
Note that <pbuf_parser> is not NULL */ + if (found && field == fid_sz - 1) + return pbuf_parser->smp_store(smp, type, *pos, *len, elen); + + if ((ssize_t)(elen) > 0) + elen -= sleft - *len; + + if (found) { + field++; + } + else if ((ssize_t)elen <= 0) { + field = 0; + } + } + + return 0; +} + +#endif /* _HAPROXY_PROTOBUF_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/protocol-t.h b/include/haproxy/protocol-t.h new file mode 100644 index 0000000..b85f29c --- /dev/null +++ b/include/haproxy/protocol-t.h @@ -0,0 +1,148 @@ +/* + * include/haproxy/protocol-t.h + * This file defines the structures used by generic network protocols. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PROTOCOL_T_H +#define _HAPROXY_PROTOCOL_T_H + +#include <sys/types.h> +#include <sys/socket.h> + +#include <import/ebtree-t.h> +#include <haproxy/api-t.h> + +/* some pointer types referenced below */ +struct listener; +struct receiver; +struct connection; + +/* + * Custom network family for str2sa parsing. Should be ok to do this since + * sa_family_t is standardized as an unsigned integer + */ +#define AF_CUST_EXISTING_FD (AF_MAX + 1) +#define AF_CUST_SOCKPAIR (AF_MAX + 2) +#define AF_CUST_RHTTP_SRV (AF_MAX + 3) +#define AF_CUST_MAX (AF_MAX + 4) + +/* + * Test in case AF_CUST_MAX overflows the sa_family_t (unsigned int) + */ +#if (AF_CUST_MAX < AF_MAX) +# error "Can't build on the target system, AF_CUST_MAX overflow" +#endif + +/* socket-level protocol types, used for protocol selection */ +enum proto_type { + PROTO_TYPE_STREAM, /* streaming protocol (like TCP) */ + PROTO_TYPE_DGRAM, /* datagram protocol (like UDP) */ + PROTO_NUM_TYPES /* must be the last one */ +}; + +/* max length of a protocol name, including trailing zero */ +#define PROTO_NAME_LEN 16 + +/* flags for ->connect() */ +#define CONNECT_HAS_DATA 0x00000001 /* There's data available to be sent */ +#define CONNECT_DELACK_SMART_CONNECT 0x00000002 /* Use a delayed ACK if the backend has tcp-smart-connect */ +#define CONNECT_DELACK_ALWAYS 0x00000004 /* Use a delayed ACK */ +#define CONNECT_CAN_USE_TFO 0x00000008 /* We can use TFO for this connection */ + +/* Flags for protocol->flags */ +#define PROTO_F_REUSEPORT_SUPPORTED 0x00000001 /* SO_REUSEPORT is supported */ +#define PROTO_F_REUSEPORT_TESTED 0x00000002 /* SO_REUSEPORT support was tested */ + +/* protocol families define standard functions acting on a given address family + * for a socket implementation, such as AF_INET/PF_INET for example. 
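+ * (optional callbacks such as set_port may be left NULL when the notion
+ * does not apply to the family, as noted on the fields below)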
+ */ +struct proto_fam { + char name[PROTO_NAME_LEN]; /* family name, zero-terminated */ + int sock_domain; /* socket domain, as passed to socket() */ + sa_family_t sock_family; /* socket family, for sockaddr */ + ushort l3_addrlen; /* layer3 address length, used by hashes */ + socklen_t sock_addrlen; /* socket address length, used by bind() */ + /* 4-bytes hole here */ + int (*addrcmp)(const struct sockaddr_storage *, const struct sockaddr_storage *); /* compare addresses (like memcmp) */ + int (*bind)(struct receiver *rx, char **errmsg); /* bind a receiver */ + int (*get_src)(int fd, struct sockaddr *, socklen_t, int dir); /* syscall used to retrieve connection's src addr */ + int (*get_dst)(int fd, struct sockaddr *, socklen_t, int dir); /* syscall used to retrieve connection's dst addr */ + void (*set_port)(struct sockaddr_storage *, int port); /* set the port on the address; NULL if not implemented */ +}; + +/* This structure contains all information needed to easily handle a protocol. + * Its primary goal is to ease listeners maintenance. Specifically, the + * bind() primitive must be used before any fork(). rx_suspend()/rx_resume() + * return >0 on success, 0 if rx stopped, -1 on failure to proceed. rx_* may + * be null if the protocol doesn't provide direct access to the receiver. + */ +struct protocol { + char name[PROTO_NAME_LEN]; /* protocol name, zero-terminated */ + struct proto_fam *fam; /* protocol family */ + int xprt_type; /* transport layer type (PROTO_TYPE_STREAM/PROTO_TYPE_DGRAM) */ + enum proto_type proto_type; /* protocol type at the socket layer (PROTO_TYPE_*) */ + int sock_type; /* socket type, as passed to socket() */ + int sock_prot; /* socket protocol, as passed to socket() */ + + /* functions acting on the listener */ + void (*add)(struct protocol *p, struct listener *l); /* add a listener for this protocol */ + int (*listen)(struct listener *l, char *errmsg, int errlen); /* start a listener */ + void (*enable)(struct listener *l); /* enable receipt of new connections */ + void (*disable)(struct listener *l); /* disable receipt of new connections */ + void (*unbind)(struct listener *l); /* unbind the listener and possibly its receiver */ + int (*suspend)(struct listener *l); /* try to suspend the listener */ + int (*resume)(struct listener *l); /* try to resume a suspended listener */ + struct connection *(*accept_conn)(struct listener *l, int *status); /* accept a new connection */ + + /* functions acting on connections */ + void (*ctrl_init)(struct connection *); /* completes initialization of the connection */ + void (*ctrl_close)(struct connection *); /* completes release of the connection */ + int (*connect)(struct connection *, int flags); /* connect function if any, see below for flags values */ + int (*drain)(struct connection *); /* drain pending data; 0=failed, >0=success */ + int (*check_events)(struct connection *conn, int event_type); /* subscribe to socket events */ + void (*ignore_events)(struct connection *conn, int event_type); /* unsubscribe from socket events */ + int (*get_src)(struct connection *conn, struct sockaddr *, socklen_t); /* retrieve connection's source address; -1=fail */ + int (*get_dst)(struct connection *conn, struct sockaddr *, socklen_t); /* retrieve connection's dest address; -1=fail */ + int (*set_affinity)(struct connection *conn, int new_tid); + + /* functions acting on the receiver */ + int (*rx_suspend)(struct receiver *rx); /* temporarily suspend this receiver for a soft restart */ + int (*rx_resume)(struct receiver 
*rx); /* try to resume a temporarily suspended receiver */ + void (*rx_enable)(struct receiver *rx); /* enable receiving on the receiver */ + void (*rx_disable)(struct receiver *rx); /* disable receiving on the receiver */ + void (*rx_unbind)(struct receiver *rx); /* unbind the receiver, most often closing the FD */ + int (*rx_listening)(const struct receiver *rx); /* is the receiver listening ? 0=no, >0=OK, <0=unrecoverable */ + + /* default I/O handler */ + void (*default_iocb)(int fd); /* generic I/O handler (typically accept callback) */ + + uint flags; /* flags describing protocol support (PROTO_F_*) */ + uint nb_receivers; /* number of receivers (under proto_lock) */ + struct list receivers; /* list of receivers using this protocol (under proto_lock) */ + struct list list; /* list of registered protocols (under proto_lock) */ +}; + +#endif /* _HAPROXY_PROTOCOL_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/protocol.h b/include/haproxy/protocol.h new file mode 100644 index 0000000..828093d --- /dev/null +++ b/include/haproxy/protocol.h @@ -0,0 +1,111 @@ +/* + * include/haproxy/protocol.h + * This file declares generic protocol management primitives. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PROTOCOL_H +#define _HAPROXY_PROTOCOL_H + +#include <sys/socket.h> +#include <haproxy/protocol-t.h> +#include <haproxy/thread.h> + +/* [AF][sock_dgram][ctrl_dgram] */ +extern struct protocol *__protocol_by_family[AF_CUST_MAX][PROTO_NUM_TYPES][2]; +__decl_thread(extern HA_SPINLOCK_T proto_lock); + +/* Registers the protocol <proto> */ +void protocol_register(struct protocol *proto); + +/* Unregisters the protocol <proto>. Note that all listeners must have + * previously been unbound. + */ +void protocol_unregister(struct protocol *proto); + +/* clears flag <flag> on all protocols. */ +void protocol_clrf_all(uint flag); + +/* sets flag <flag> on all protocols. */ +void protocol_setf_all(uint flag); + +/* Checks if protocol <proto> supports PROTO_F flag <flag>. Returns zero if not, + * non-zero if supported. It may return a cached value from a previous test, + * and may run live tests then update the proto's flags to cache a result. It's + * better to call it only if needed so that it doesn't result in modules being + * loaded in case of a live test. + */ +int protocol_supports_flag(struct protocol *proto, uint flag); + +/* binds all listeners of all registered protocols. Returns a composition + * of ERR_NONE, ERR_RETRYABLE, ERR_FATAL, ERR_ABORT. + */ +int protocol_bind_all(int verbose); + +/* unbinds all listeners of all registered protocols. They are also closed. + * This must be performed before calling exit() in order to get a chance to + * remove file-system based sockets and pipes. 
+ * Returns a composition of ERR_NONE, ERR_RETRYABLE, ERR_FATAL. + */ +int protocol_unbind_all(void); + +/* stops all listeners of all registered protocols. This will normally catch + * every single listener, all protocols included. This is to be used during + * soft_stop() only. It does not return any error. + */ +void protocol_stop_now(void); + +/* pauses all listeners of all registered protocols. This is typically + * used on SIG_TTOU to release all listening sockets for the time needed to + * try to bind a new process. The listeners enter LI_PAUSED. It returns + * ERR_NONE, with ERR_FATAL on failure. + */ +int protocol_pause_all(void); + +/* resumes all listeners of all registered protocols. This is typically used on + * SIG_TTIN to re-enable listening sockets after a new process failed to bind. + * The listeners switch to LI_READY/LI_FULL. It returns ERR_NONE, with ERR_FATAL + * on failure. + */ +int protocol_resume_all(void); + +/* enables all listeners of all registered protocols. This is intended to be + * used after a fork() to enable reading on all file descriptors. Returns a + * composition of ERR_NONE, ERR_RETRYABLE, ERR_FATAL. + */ +int protocol_enable_all(void); + +/* returns the protocol associated to family <family> with proto_type among the + * supported protocol types, and ctrl_type of either SOCK_STREAM or SOCK_DGRAM + * depending on the requested values, or NULL if not found. + */ +static inline struct protocol *protocol_lookup(int family, enum proto_type proto_type, int ctrl_dgram) +{ + if (family >= 0 && family < AF_CUST_MAX) + return __protocol_by_family[family][proto_type][!!ctrl_dgram]; + return NULL; +} + +#endif /* _HAPROXY_PROTOCOL_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/proxy-t.h b/include/haproxy/proxy-t.h new file mode 100644 index 0000000..2f7bf7b --- /dev/null +++ b/include/haproxy/proxy-t.h @@ -0,0 +1,547 @@ +/* + * include/haproxy/proxy-t.h + * This file defines everything related to proxies. + * + * Copyright (C) 2000-2011 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_PROXY_T_H +#define _HAPROXY_PROXY_T_H + +#include <sys/types.h> +#include <sys/socket.h> +#include <netinet/in.h> +#include <arpa/inet.h> + +#include <import/ebtree-t.h> + +#include <haproxy/api-t.h> +#include <haproxy/arg-t.h> +#include <haproxy/backend-t.h> +#include <haproxy/compression-t.h> +#include <haproxy/counters-t.h> +#include <haproxy/freq_ctr-t.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/queue-t.h> +#include <haproxy/server-t.h> +#include <haproxy/stats-t.h> +#include <haproxy/tcpcheck-t.h> +#include <haproxy/thread-t.h> +#include <haproxy/tools-t.h> +#include <haproxy/uri_auth-t.h> +#include <haproxy/http_ext-t.h> + +/* values for proxy->mode */ +enum pr_mode { + PR_MODE_TCP = 0, + PR_MODE_HTTP, + PR_MODE_CLI, + PR_MODE_SYSLOG, + PR_MODE_PEERS, + PR_MODES +} __attribute__((packed)); + +enum PR_SRV_STATE_FILE { + PR_SRV_STATE_FILE_UNSPEC = 0, + PR_SRV_STATE_FILE_NONE, + PR_SRV_STATE_FILE_GLOBAL, + PR_SRV_STATE_FILE_LOCAL, +}; + + +/* flag values for proxy->cap. This is a bitmask of capabilities supported by the proxy */ +#define PR_CAP_NONE 0x0000 +#define PR_CAP_FE 0x0001 +#define PR_CAP_BE 0x0002 +#define PR_CAP_LISTEN (PR_CAP_FE|PR_CAP_BE) +#define PR_CAP_DEF 0x0004 /* defaults section */ +#define PR_CAP_INT 0x0008 /* internal proxy (used by lua engine) */ +#define PR_CAP_LB 0x0010 /* load-balancing capabilities, i.e. listen/frontend/backend proxies */ +#define PR_CAP_HTTPCLIENT 0x0020 /* proxy used for httpclient */ + +/* bits for proxy->options */ +#define PR_O_REDISP 0x00000001 /* allow reconnection to dispatch in case of errors */ +#define PR_O_TRANSP 0x00000002 /* transparent mode : use original DEST as dispatch */ + +/* HTTP server-side reuse */ +#define PR_O_REUSE_NEVR 0x00000000 /* never reuse a shared connection */ +#define PR_O_REUSE_SAFE 0x00000004 /* only reuse a shared connection when it's safe to do so */ +#define PR_O_REUSE_AGGR 0x00000008 /* aggressively reuse a shared connection */ +#define PR_O_REUSE_ALWS 0x0000000C /* always reuse a shared connection */ +#define PR_O_REUSE_MASK 0x0000000C /* mask to retrieve shared connection preferences */ + +#define PR_O_IDLE_CLOSE_RESP 0x00000010 /* avoid closing idle connections during a soft stop */ +#define PR_O_PREF_LAST 0x00000020 /* prefer last server */ +#define PR_O_DISPATCH 0x00000040 /* use dispatch mode */ +#define PR_O_FORCED_ID 0x00000080 /* proxy's ID was forced in the configuration */ +/* unused: 0x00000100 */ +#define PR_O_IGNORE_PRB 0x00000200 /* ignore empty requests (aborts and timeouts) */ +#define PR_O_NULLNOLOG 0x00000400 /* a connect without request will not be logged */ +#define PR_O_WREQ_BODY 0x00000800 /* always wait for the HTTP request body */ +#define PR_O_HTTP_UPG 0x00001000 /* Contain a "switch-mode http" tcp-request rule */ +/* unused: 0x00002000 */ +#define PR_O_PERSIST 0x00004000 /* server persistence stays effective even when server is down */ +#define PR_O_LOGASAP 0x00008000 /* log as soon as possible, without waiting for the stream to complete */ +#define PR_O_ERR_LOGFMT 0x00010000 /* use log-format for connection error message */ +#define PR_O_CHK_CACHE 0x00020000 /* require examination of cacheability of the 'set-cookie' field */ +#define PR_O_TCP_CLI_KA 0x00040000 /* enable TCP keep-alive on client-side streams */ +#define 
PR_O_TCP_SRV_KA 0x00080000 /* enable TCP keep-alive on server-side streams */ +#define PR_O_USE_ALL_BK 0x00100000 /* load-balance between backup servers */ +/* unused: 0x00200000 */ +#define PR_O_TCP_NOLING 0x00400000 /* disable lingering on client and server connections */ +#define PR_O_ABRT_CLOSE 0x00800000 /* immediately abort request when client closes */ + +#define PR_O_HTTP_KAL 0x00000000 /* HTTP keep-alive mode (http-keep-alive) */ +#define PR_O_HTTP_CLO 0x01000000 /* HTTP close mode (httpclose) */ +#define PR_O_HTTP_SCL 0x02000000 /* HTTP server close mode (http-server-close) */ +#define PR_O_HTTP_MODE 0x03000000 /* MASK to retrieve the HTTP mode */ +/* unused: 0x04000000 */ + +#define PR_O_TCPCHK_SSL 0x08000000 /* at least one TCPCHECK connect rule requires SSL */ +#define PR_O_CONTSTATS 0x10000000 /* continuous counters */ +/* unused: 0x20000000 */ +#define PR_O_DISABLE404 0x40000000 /* Disable a server on a 404 response to a health-check */ +/* unused: 0x80000000 */ + +/* bits for proxy->options2 */ +#define PR_O2_SPLIC_REQ 0x00000001 /* transfer requests using linux kernel's splice() */ +#define PR_O2_SPLIC_RTR 0x00000002 /* transfer responses using linux kernel's splice() */ +#define PR_O2_SPLIC_AUT 0x00000004 /* automatically use linux kernel's splice() */ +#define PR_O2_SPLIC_ANY (PR_O2_SPLIC_REQ|PR_O2_SPLIC_RTR|PR_O2_SPLIC_AUT) +#define PR_O2_REQBUG_OK 0x00000008 /* let buggy requests pass through */ +#define PR_O2_RSPBUG_OK 0x00000010 /* let buggy responses pass through */ +#define PR_O2_NOLOGNORM 0x00000020 /* don't log normal traffic, only errors and retries */ +#define PR_O2_LOGERRORS 0x00000040 /* log errors and retries at level LOG_ERR */ +#define PR_O2_SMARTACC 0x00000080 /* don't immediately ACK request after accept */ +#define PR_O2_SMARTCON 0x00000100 /* don't immediately send empty ACK after connect */ +#define PR_O2_RDPC_PRST 0x00000200 /* Activate rdp cookie analyser */ +#define PR_O2_CLFLOG 0x00000400 /* log into clf format */ +#define PR_O2_LOGHCHKS 0x00000800 /* log health checks */ +#define PR_O2_INDEPSTR 0x00001000 /* independent streams, don't update rex on write */ +#define PR_O2_SOCKSTAT 0x00002000 /* collect & provide separate statistics for sockets */ + +#define PR_O2_H1_ADJ_BUGCLI 0x00008000 /* adjust the case of h1 headers of the response for bogus clients */ +#define PR_O2_H1_ADJ_BUGSRV 0x00004000 /* adjust the case of h1 headers of the request for bogus servers */ +#define PR_O2_NO_H2_UPGRADE 0x00010000 /* disable the implicit H2 upgrades from H1 client connections */ + +#define PR_O2_NODELAY 0x00020000 /* fully interactive mode, never delay outgoing data */ +#define PR_O2_USE_PXHDR 0x00040000 /* use Proxy-Connection for proxy requests */ +#define PR_O2_CHK_SNDST 0x00080000 /* send the state of each server along with HTTP health checks */ + +#define PR_O2_SRC_ADDR 0x00100000 /* get the source ip and port for logs */ + +#define PR_O2_FAKE_KA 0x00200000 /* pretend we do keep-alive with server even though we close */ + +#define PR_O2_RSTRICT_REQ_HDR_NAMES_BLK 0x00400000 /* reject request with header names containing chars outside of [0-9a-zA-Z-] charset */ +#define PR_O2_RSTRICT_REQ_HDR_NAMES_DEL 0x00800000 /* remove request header names containing chars outside of [0-9a-zA-Z-] charset */ +#define PR_O2_RSTRICT_REQ_HDR_NAMES_NOOP 0x01000000 /* preserve request header names containing chars outside of [0-9a-zA-Z-] charset */ +#define PR_O2_RSTRICT_REQ_HDR_NAMES_MASK 0x01c00000 /* mask for restrict-http-header-names option */ +/* unused : 
0x0000000..0x80000000 */ + +/* server health checks */ +#define PR_O2_CHK_NONE 0x00000000 /* no L7 health checks configured (TCP by default) */ +#define PR_O2_TCPCHK_CHK 0x90000000 /* use TCPCHK check for server health */ +#define PR_O2_EXT_CHK 0xA0000000 /* use external command for server health */ +/* unused: 0xB0000000 to 0xF000000, reserved for health checks */ +#define PR_O2_CHK_ANY 0xF0000000 /* Mask to cover any check */ +/* end of proxy->options2 */ + +/* Cookie settings for pr->ck_opts */ +#define PR_CK_RW 0x00000001 /* rewrite all direct cookies with the right serverid */ +#define PR_CK_IND 0x00000002 /* keep only indirect cookies */ +#define PR_CK_INS 0x00000004 /* insert cookies when not accessing a server directly */ +#define PR_CK_PFX 0x00000008 /* rewrite all cookies by prefixing the right serverid */ +#define PR_CK_ANY (PR_CK_RW | PR_CK_IND | PR_CK_INS | PR_CK_PFX) +#define PR_CK_NOC 0x00000010 /* add a 'Cache-control' header with the cookie */ +#define PR_CK_POST 0x00000020 /* don't insert cookies for requests other than a POST */ +#define PR_CK_PSV 0x00000040 /* cookie ... preserve */ +#define PR_CK_HTTPONLY 0x00000080 /* emit the "HttpOnly" attribute */ +#define PR_CK_SECURE 0x00000100 /* emit the "Secure" attribute */ +#define PR_CK_DYNAMIC 0x00000200 /* create dynamic cookies for each server */ + +/* bits for sticking rules */ +#define STK_IS_MATCH 0x00000001 /* match on request fetch */ +#define STK_IS_STORE 0x00000002 /* store on request fetch */ +#define STK_ON_RSP 0x00000004 /* store on response fetch */ + +/* diff bits for proxy_find_best_match */ +#define PR_FBM_MISMATCH_ID 0x01 +#define PR_FBM_MISMATCH_NAME 0x02 +#define PR_FBM_MISMATCH_PROXYTYPE 0x04 + +/* Bits for the different retry causes */ +#define PR_RE_CONN_FAILED 0x00000001 /* Retry if we failed to connect */ +#define PR_RE_DISCONNECTED 0x00000002 /* Retry if we got disconnected with no answer */ +#define PR_RE_TIMEOUT 0x00000004 /* Retry if we got a server timeout before we got any data */ +#define PR_RE_401 0x00000008 /* Retry if we got a 401 */ +#define PR_RE_403 0x00000010 /* Retry if we got a 403 */ +#define PR_RE_404 0x00000020 /* Retry if we got a 404 */ +#define PR_RE_408 0x00000040 /* Retry if we got a 408 */ +#define PR_RE_425 0x00000080 /* Retry if we got a 425 */ +#define PR_RE_500 0x00000100 /* Retry if we got a 500 */ +#define PR_RE_501 0x00000200 /* Retry if we got a 501 */ +#define PR_RE_502 0x00000400 /* Retry if we got a 502 */ +#define PR_RE_503 0x00000800 /* Retry if we got a 503 */ +#define PR_RE_504 0x00001000 /* Retry if we got a 504 */ +#define PR_RE_STATUS_MASK (PR_RE_401 | PR_RE_403 | PR_RE_404 | \ + PR_RE_408 | PR_RE_425 | PR_RE_500 | \ + PR_RE_501 | PR_RE_502 | PR_RE_503 | \ + PR_RE_504) +/* 0x00000800, 0x00001000, 0x00002000, 0x00004000 and 0x00008000 unused, + * reserved for eventual future status codes + */ +#define PR_RE_EARLY_ERROR 0x00010000 /* Retry if we failed at sending early data */ +#define PR_RE_JUNK_REQUEST 0x00020000 /* We received an incomplete or garbage response */ + +/* Proxy flags */ +#define PR_FL_DISABLED 0x01 /* The proxy was disabled in the configuration (not at runtime) */ +#define PR_FL_STOPPED 0x02 /* The proxy was stopped */ +#define PR_FL_READY 0x04 /* The proxy is ready to be used (initialized and configured) */ +#define PR_FL_EXPLICIT_REF 0x08 /* The default proxy is explicitly referenced by another proxy */ +#define PR_FL_IMPLICIT_REF 0x10 /* The default proxy is implicitly referenced by another proxy */ +#define PR_FL_PAUSED 0x20 /* The proxy 
was paused at run time (reversible) */ + +struct stream; + +struct http_snapshot { + unsigned int sid; /* ID of the faulty stream */ + unsigned int state; /* message state before the error (when saved) */ + unsigned int b_flags; /* buffer flags */ + unsigned int s_flags; /* stream flags */ + + unsigned int t_flags; /* transaction flags */ + unsigned int m_flags; /* message flags */ + unsigned long long m_clen; /* chunk len for this message */ + unsigned long long m_blen; /* body len for this message */ +}; + +struct h1_snapshot { + unsigned int state; /* H1 message state when the error occurred */ + unsigned int c_flags; /* H1 connection flags */ + unsigned int s_flags; /* H1 stream flags */ + unsigned int m_flags; /* H1 message flags */ + unsigned long long m_clen; /* chunk len for this message */ + unsigned long long m_blen; /* body len for this message */ +}; + +union error_snapshot_ctx { + struct http_snapshot http; + struct h1_snapshot h1; +}; + +struct error_snapshot { + /**** common part ****/ + struct timeval when; /* date of this event, (tv_sec == 0) means "never" */ + /* @16 */ + void (*show)(struct buffer *, const struct error_snapshot *); /* dump function */ + unsigned long long buf_ofs; /* relative position of the buffer's input inside its container */ + /* @32 */ + unsigned int buf_out; /* pending output bytes _before_ the buffer's input (0..buf->data-1) */ + unsigned int buf_len; /* original length of the last invalid request/response (0..buf->data-1-buf_out) */ + unsigned int buf_err; /* buffer-relative position where the error was detected (0..len-1) */ + unsigned int buf_wrap; /* buffer-relative position where the buffer is expected to wrap (1..buf_size) */ + /* @48 */ + struct proxy *oe; /* other end = frontend or backend involved */ + struct server *srv; /* server associated with the error (or NULL) */ + /* @64 */ + unsigned int ev_id; /* event number (counter incremented for each capture) */ + /* @68: 4 bytes hole here */ + struct sockaddr_storage src; /* client's address */ + + /**** protocol-specific part ****/ + union error_snapshot_ctx ctx; + char buf[VAR_ARRAY]; /* copy of the beginning of the message for bufsize bytes */ +}; + +struct proxy { + enum obj_type obj_type; /* object type == OBJ_TYPE_PROXY */ + char flags; /* bit field PR_FL_* */ + enum pr_mode mode; /* mode = PR_MODE_TCP, PR_MODE_HTTP, ... */ + char cap; /* supported capabilities (PR_CAP_*) */ + unsigned int maxconn; /* max # of active streams on the frontend */ + + int options; /* PR_O_REDISP, PR_O_TRANSP, ... */ + int options2; /* PR_O2_* */ + unsigned int ck_opts; /* PR_CK_* (cookie options) */ + unsigned int fe_req_ana, be_req_ana; /* bitmap of common request protocol analysers for the frontend and backend */ + unsigned int fe_rsp_ana, be_rsp_ana; /* bitmap of common response protocol analysers for the frontend and backend */ + unsigned int http_needed; /* non-null if HTTP analyser may be used */ + union { + struct proxy *be; /* default backend, or NULL if none set */ + char *name; /* default backend name during config parse */ + } defbe; + struct proxy *defpx; /* default proxy used to init this one (may be NULL) */ + struct list acl; /* ACL declared on this proxy */ + struct list http_req_rules; /* HTTP request rules: allow/deny/... */ + struct list http_res_rules; /* HTTP response rules: allow/deny/... */ + struct list http_after_res_rules; /* HTTP final response rules: set-header/del-header/... 
*/
+	struct list redirect_rules;		/* content redirecting rules (chained) */
+	struct list switching_rules;		/* content switching rules (chained) */
+	struct list persist_rules;		/* 'force-persist' and 'ignore-persist' rules (chained) */
+	struct list sticking_rules;		/* content sticking rules (chained) */
+	struct list storersp_rules;		/* content store response rules (chained) */
+	struct list server_rules;		/* server switching rules (chained) */
+	struct {				/* TCP request processing */
+		unsigned int inspect_delay;	/* inspection delay */
+		struct list inspect_rules;	/* inspection rules */
+		struct list l4_rules;		/* layer4 rules */
+		struct list l5_rules;		/* layer5 rules */
+	} tcp_req;
+	struct {				/* TCP response processing */
+		unsigned int inspect_delay;	/* inspection delay */
+		struct list inspect_rules;	/* inspection rules */
+	} tcp_rep;
+	struct server *srv, defsrv;		/* known servers; default server configuration */
+	struct lbprm lbprm;			/* load-balancing parameters */
+	int srv_act, srv_bck;			/* # of servers eligible for LB (UP|!checked) AND (enabled+weight!=0) */
+	int served;				/* # of active sessions currently being served */
+	int cookie_len;				/* strlen(cookie_name), computed only once */
+	char *cookie_domain;			/* domain used to insert the cookie */
+	char *cookie_name;			/* name of the cookie to look for */
+	char *cookie_attrs;			/* list of attributes to add to the cookie */
+	char *dyncookie_key;			/* Secret key used to generate dynamic persistent cookies */
+	unsigned int cookie_maxidle;		/* max idle time for this cookie */
+	unsigned int cookie_maxlife;		/* max life time for this cookie */
+	char *rdp_cookie_name;			/* name of the RDP cookie to look for */
+	char *capture_name;			/* beginning of the name of the cookie to capture */
+	int rdp_cookie_len;			/* strlen(rdp_cookie_name), computed only once */
+	int capture_namelen;			/* length of the cookie name to match */
+	struct uri_auth *uri_auth;		/* if non-NULL, the (list of) per-URI authentications */
+	int capture_len;			/* length of the string to be captured */
+	int max_out_conns;			/* Max number of idling connections we keep for a session */
+	int max_ka_queue;			/* 1+maximum requests in queue accepted for reusing a K-A conn (0=none) */
+	int clitcpka_cnt;			/* The maximum number of keepalive probes TCP should send before dropping the connection. (client side) */
+	int clitcpka_idle;			/* The time (in seconds) the connection needs to remain idle before TCP starts sending keepalive probes. (client side) */
+	int clitcpka_intvl;			/* The time (in seconds) between individual keepalive probes. (client side) */
+	int srvtcpka_cnt;			/* The maximum number of keepalive probes TCP should send before dropping the connection. (server side) */
+	int srvtcpka_idle;			/* The time (in seconds) the connection needs to remain idle before TCP starts sending keepalive probes. (server side) */
+	int srvtcpka_intvl;			/* The time (in seconds) between individual keepalive probes. (server side) */
+	struct ist monitor_uri;			/* a special URI to which we respond with HTTP/200 OK */
+	struct list mon_fail_cond;		/* list of conditions to fail monitoring requests (chained) */
+	struct {				/* WARNING! check proxy_reset_timeouts() in proxy.h !!!
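+						 * (a timeout added below presumably needs to be
+						 * handled there as well)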
*/
+ int client; /* client I/O timeout (in ticks) */
+ int tarpit; /* tarpit timeout, defaults to connect if unspecified */
+ int queue; /* queue timeout, defaults to connect if unspecified */
+ int connect; /* connect timeout (in ticks) */
+ int server; /* server I/O timeout (in ticks) */
+ int client_hs; /* maximum time for client handshake completion */
+ int httpreq; /* maximum time for complete HTTP request */
+ int httpka; /* maximum time for a new HTTP request when using keep-alive */
+ int check; /* maximum time for complete check */
+ int tunnel; /* I/O timeout to use in tunnel mode (in ticks) */
+ int clientfin; /* timeout to apply to client half-closed connections */
+ int serverfin; /* timeout to apply to server half-closed connections */
+ } timeout;
+ __decl_thread(HA_RWLOCK_T lock); /* may be taken under the server's lock */
+
+ char *id, *desc; /* proxy id (name) and description */
+ struct queue queue; /* queued requests (pendconns) */
+ int totpend; /* total number of pending connections on this instance (for stats) */
+ unsigned int feconn, beconn; /* # of active frontend and backend streams */
+ struct freq_ctr fe_req_per_sec; /* HTTP requests per second on the frontend */
+ struct freq_ctr fe_conn_per_sec; /* received connections per second on the frontend */
+ struct freq_ctr fe_sess_per_sec; /* accepted sessions per second on the frontend (after tcp rules) */
+ struct freq_ctr be_sess_per_sec; /* sessions per second on the backend */
+ unsigned int fe_sps_lim; /* limit on new sessions per second on the frontend */
+ unsigned int fullconn; /* #conns on backend above which servers are used at full load */
+ unsigned int tot_fe_maxconn; /* #maxconn of frontends linked to that backend, it is used to compute fullconn */
+ struct ist server_id_hdr_name; /* the header to use to send the server id (name) */
+ int conn_retries; /* maximum number of connect retries */
+ unsigned int retry_type; /* Type of retry allowed */
+ int redispatch_after; /* number of retries before redispatch */
+ unsigned down_trans; /* up-down transitions */
+ unsigned down_time; /* total time the proxy was down */
+ time_t last_change; /* last time the state was changed */
+ int (*accept)(struct stream *s); /* application layer's accept() */
+ struct conn_src conn_src; /* connection source settings */
+ enum obj_type *default_target; /* default target to use for accepted streams or NULL */
+ struct proxy *next;
+ struct proxy *next_stkt_ref; /* Link to the list of proxies which refer to the same stick-table.
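+ * This list is walked by in_proxies_list() in proxy.h.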
*/
+
+ struct list loggers; /* one per 'log' directive */
+ struct list logformat; /* log_format linked list */
+ struct list logformat_sd; /* log_format linked list for the RFC5424 structured-data part */
+ struct list logformat_error; /* log_format linked list used in case of connection error on the frontend */
+ struct buffer log_tag; /* override default syslog tag */
+ struct ist header_unique_id; /* unique-id header */
+ struct list format_unique_id; /* unique-id format */
+ int to_log; /* things to be logged (LW_*) */
+ int nb_req_cap, nb_rsp_cap; /* # of headers to be captured */
+ struct cap_hdr *req_cap; /* chained list of request headers to be captured */
+ struct cap_hdr *rsp_cap; /* chained list of response headers to be captured */
+ struct pool_head *req_cap_pool, /* pools of pre-allocated char ** used to build the streams */
+ *rsp_cap_pool;
+ struct be_counters be_counters; /* backend statistics counters */
+ struct fe_counters fe_counters; /* frontend statistics counters */
+
+ struct mt_list listener_queue; /* list of the temporarily limited listeners because of lack of a proxy resource */
+ struct stktable *table; /* table for storing sticking streams */
+
+ struct task *task; /* the associated task, mandatory to manage rate limiting, stopping and resource shortage, NULL if disabled */
+ struct tcpcheck_rules tcpcheck_rules; /* tcp-check send / expect rules */
+ char *check_command; /* Command to use for external agent checks */
+ char *check_path; /* PATH environment to use for external agent checks */
+ struct http_reply *replies[HTTP_ERR_SIZE]; /* HTTP replies for known errors */
+ unsigned int log_count; /* number of logs produced by the frontend */
+ int uuid; /* universally unique proxy ID, used for SNMP */
+ unsigned int backlog; /* force the frontend's listen backlog */
+ unsigned int li_all; /* total number of listeners attached to this proxy */
+ unsigned int li_paused; /* total number of listeners paused (LI_PAUSED) */
+ unsigned int li_bound; /* total number of listeners bound (LI_LISTEN) */
+ unsigned int li_ready; /* total number of listeners ready (>=LI_READY) */
+ unsigned int li_suspended; /* total number of listeners suspended (could be paused or unbound) */
+
+ /* warning: these structs are huge, keep them at the bottom */
+ struct sockaddr_storage dispatch_addr; /* the default address to connect to */
+ struct error_snapshot *invalid_req, *invalid_rep; /* captures of last errors */
+
+ /* used only during configuration parsing */
+ int no_options; /* PR_O_REDISP, PR_O_TRANSP, ...
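+ * (options explicitly disabled with a "no option" statement)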
*/
+ int no_options2; /* PR_O2_* */
+
+ struct {
+ char *file; /* file where the section appears */
+ struct eb32_node id; /* place in the tree of used IDs */
+ int line; /* line where the section appears */
+ struct eb_root used_listener_id; /* list of listener IDs in use */
+ struct eb_root used_server_id; /* list of server IDs in use */
+ struct eb_root used_server_name; /* list of server names in use */
+ struct list bind; /* list of bind settings */
+ struct list listeners; /* list of listeners belonging to this frontend */
+ struct list errors; /* list of all custom error files */
+ struct arg_list args; /* sample arg list that need to be resolved */
+ unsigned int refcount; /* refcount on this proxy (only used for default proxy for now) */
+ struct ebpt_node by_name; /* proxies are stored sorted by name here */
+ char *logformat_string; /* log format string */
+ char *lfs_file; /* file name where the logformat string appears (strdup) */
+ int lfs_line; /* line number where the logformat string appears */
+ int uif_line; /* line number where the unique-id-format string appears */
+ char *uif_file; /* file name where the unique-id-format string appears (strdup) */
+ char *uniqueid_format_string; /* unique-id format string */
+ char *logformat_sd_string; /* log format string for the RFC5424 structured-data part */
+ char *lfsd_file; /* file name where the structured-data logformat string for RFC5424 appears (strdup) */
+ int lfsd_line; /* line number where the structured-data logformat string for RFC5424 appears */
+ char *error_logformat_string; /* log format string used in case of connection error on the frontend */
+ char *elfs_file; /* file name where the error logformat string appears (strdup) */
+ int elfs_line; /* line number where the error logformat string appears */
+ } conf; /* config information */
+ struct http_ext *http_ext; /* http ext options */
+ struct eb_root used_server_addr; /* list of server addresses in use */
+ void *parent; /* parent of the proxy when applicable */
+ struct comp *comp; /* http compression */
+
+ struct {
+ union {
+ struct mailers *m; /* Mailer to send email alerts via */
+ char *name;
+ } mailers;
+ char *from; /* Address to send email alerts from */
+ char *to; /* Address(es) to send email alerts to */
+ char *myhostname; /* Identity to use in HELO command sent to mailer */
+ int level; /* Maximum syslog level of messages to send
+ * email alerts for */
+ int set; /* True if email_alert settings are present */
+ struct email_alertq *queues; /* per-mailer alerts queues */
+ } email_alert;
+
+ int load_server_state_from_file; /* location of the file containing server state.
+ * flag PR_SRV_STATE_FILE_* */
+ char *server_state_file_name; /* used when load_server_state_from_file is set to
+ * PR_SRV_STATE_FILE_LOCAL. Give a specific file name for
+ * this backend.
If not specified or void, then the backend + * name is used + */ + struct list filter_configs; /* list of the filters that are declared on this proxy */ + + EXTRA_COUNTERS(extra_counters_fe); + EXTRA_COUNTERS(extra_counters_be); +}; + +struct switching_rule { + struct list list; /* list linked to from the proxy */ + struct acl_cond *cond; /* acl condition to meet */ + int dynamic; /* this is a dynamic rule using the logformat expression */ + union { + struct proxy *backend; /* target backend */ + char *name; /* target backend name during config parsing */ + struct list expr; /* logformat expression to use for dynamic rules */ + } be; + char *file; + int line; +}; + +struct server_rule { + struct list list; /* list linked to from the proxy */ + struct acl_cond *cond; /* acl condition to meet */ + int dynamic; + union { + struct server *ptr; /* target server */ + char *name; /* target server name during config parsing */ + } srv; + struct list expr; /* logformat expression to use for dynamic rules */ + char *file; + int line; +}; + +struct persist_rule { + struct list list; /* list linked to from the proxy */ + struct acl_cond *cond; /* acl condition to meet */ + int type; +}; + +struct sticking_rule { + struct list list; /* list linked to from the proxy */ + struct acl_cond *cond; /* acl condition to meet */ + struct sample_expr *expr; /* fetch expr to fetch key */ + int flags; /* STK_* */ + union { + struct stktable *t; /* target table */ + char *name; /* target table name during config parsing */ + } table; +}; + + +struct redirect_rule { + struct list list; /* list linked to from the proxy */ + struct acl_cond *cond; /* acl condition to meet */ + int type; + int rdr_len; + char *rdr_str; + struct list rdr_fmt; + int code; + unsigned int flags; + int cookie_len; + char *cookie_str; +}; + +/* some of the most common options which are also the easiest to handle */ +struct cfg_opt { + const char *name; + unsigned int val; + unsigned int cap; + unsigned int checks; + unsigned int mode; +}; + +#endif /* _HAPROXY_PROXY_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/proxy.h b/include/haproxy/proxy.h new file mode 100644 index 0000000..efdfa21 --- /dev/null +++ b/include/haproxy/proxy.h @@ -0,0 +1,264 @@ +/* + * include/haproxy/proxy.h + * This file defines function prototypes for proxy management. + * + * Copyright (C) 2000-2011 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_PROXY_H
+#define _HAPROXY_PROXY_H
+
+#include <haproxy/api.h>
+#include <haproxy/applet-t.h>
+#include <haproxy/freq_ctr.h>
+#include <haproxy/list.h>
+#include <haproxy/listener-t.h>
+#include <haproxy/proxy-t.h>
+#include <haproxy/server-t.h>
+#include <haproxy/ticks.h>
+#include <haproxy/thread.h>
+
+extern struct proxy *proxies_list;
+extern struct eb_root used_proxy_id; /* list of proxy IDs in use */
+extern unsigned int error_snapshot_id; /* global ID assigned to each error then incremented */
+extern struct eb_root proxy_by_name; /* tree of proxies sorted by name */
+
+extern const struct cfg_opt cfg_opts[];
+extern const struct cfg_opt cfg_opts2[];
+
+struct task *manage_proxy(struct task *t, void *context, unsigned int state);
+void proxy_cond_pause(struct proxy *p);
+void proxy_cond_resume(struct proxy *p);
+void proxy_cond_disable(struct proxy *p);
+void soft_stop(void);
+int pause_proxy(struct proxy *p);
+int resume_proxy(struct proxy *p);
+void stop_proxy(struct proxy *p);
+int stream_set_backend(struct stream *s, struct proxy *be);
+
+void free_proxy(struct proxy *p);
+const char *proxy_cap_str(int cap);
+const char *proxy_mode_str(int mode);
+const char *proxy_find_best_option(const char *word, const char **extra);
+void proxy_store_name(struct proxy *px);
+struct proxy *proxy_find_by_id(int id, int cap, int table);
+struct proxy *proxy_find_by_name(const char *name, int cap, int table);
+struct proxy *proxy_find_best_match(int cap, const char *name, int id, int *diff);
+struct server *findserver(const struct proxy *px, const char *name);
+struct server *findserver_unique_id(const struct proxy *px, int puid, uint32_t rid);
+struct server *findserver_unique_name(const struct proxy *px, const char *name, uint32_t rid);
+int proxy_cfg_ensure_no_http(struct proxy *curproxy);
+int proxy_cfg_ensure_no_log(struct proxy *curproxy);
+void init_new_proxy(struct proxy *p);
+void proxy_preset_defaults(struct proxy *defproxy);
+void proxy_free_defaults(struct proxy *defproxy);
+void proxy_destroy_defaults(struct proxy *px);
+void proxy_destroy_all_unref_defaults(void);
+void proxy_ref_defaults(struct proxy *px, struct proxy *defpx);
+void proxy_unref_defaults(struct proxy *px);
+struct proxy *alloc_new_proxy(const char *name, unsigned int cap,
+ char **errmsg);
+struct proxy *parse_new_proxy(const char *name, unsigned int cap,
+ const char *file, int linenum,
+ const struct proxy *defproxy);
+void proxy_capture_error(struct proxy *proxy, int is_back,
+ struct proxy *other_end, enum obj_type *target,
+ const struct session *sess,
+ const struct buffer *buf, long buf_ofs,
+ unsigned int buf_out, unsigned int err_pos,
+ const union error_snapshot_ctx *ctx,
+ void (*show)(struct buffer *, const struct error_snapshot *));
+void proxy_adjust_all_maxconn(void);
+struct proxy *cli_find_frontend(struct appctx *appctx, const char *arg);
+int resolve_stick_rule(struct proxy *curproxy, struct sticking_rule *mrule);
+void free_stick_rules(struct list *rules);
+void free_server_rules(struct list *srules);
+
+/*
+ * This function returns a string containing the type of the proxy in a format
+ * suitable for error messages, from its capabilities.
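+ * For example, a peers section yields "peers section", while anything else
+ * goes through proxy_cap_str() on its capabilities (typically "frontend" or
+ * "backend").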
+ */
+static inline const char *proxy_type_str(struct proxy *proxy)
+{
+ if (proxy->mode == PR_MODE_PEERS)
+ return "peers section";
+ return proxy_cap_str(proxy->cap);
+}
+
+/* Find the frontend having name <name>. The name may also start with a '#' to
+ * reference a numeric id. NULL is returned if not found.
+ */
+static inline struct proxy *proxy_fe_by_name(const char *name)
+{
+ return proxy_find_by_name(name, PR_CAP_FE, 0);
+}
+
+/* Find the backend having name <name>. The name may also start with a '#' to
+ * reference a numeric id. NULL is returned if not found.
+ */
+static inline struct proxy *proxy_be_by_name(const char *name)
+{
+ return proxy_find_by_name(name, PR_CAP_BE, 0);
+}
+
+/* this function initializes all timeouts for proxy <proxy> */
+static inline void proxy_reset_timeouts(struct proxy *proxy)
+{
+ proxy->timeout.client = TICK_ETERNITY;
+ proxy->timeout.tarpit = TICK_ETERNITY;
+ proxy->timeout.queue = TICK_ETERNITY;
+ proxy->timeout.connect = TICK_ETERNITY;
+ proxy->timeout.server = TICK_ETERNITY;
+ proxy->timeout.httpreq = TICK_ETERNITY;
+ proxy->timeout.check = TICK_ETERNITY;
+ proxy->timeout.tunnel = TICK_ETERNITY;
+}
+
+/* increase the number of cumulated connections received on the designated frontend */
+static inline void proxy_inc_fe_conn_ctr(struct listener *l, struct proxy *fe)
+{
+ _HA_ATOMIC_INC(&fe->fe_counters.cum_conn);
+ if (l && l->counters)
+ _HA_ATOMIC_INC(&l->counters->cum_conn);
+ HA_ATOMIC_UPDATE_MAX(&fe->fe_counters.cps_max,
+ update_freq_ctr(&fe->fe_conn_per_sec, 1));
+}
+
+/* increase the number of cumulated connections accepted by the designated frontend */
+static inline void proxy_inc_fe_sess_ctr(struct listener *l, struct proxy *fe)
+{
+ _HA_ATOMIC_INC(&fe->fe_counters.cum_sess);
+ if (l && l->counters)
+ _HA_ATOMIC_INC(&l->counters->cum_sess);
+ HA_ATOMIC_UPDATE_MAX(&fe->fe_counters.sps_max,
+ update_freq_ctr(&fe->fe_sess_per_sec, 1));
+}
+
+/* increase the number of cumulated HTTP sessions on the designated frontend.
+ * <http_ver> must be the HTTP version for such requests.
+ */
+static inline void proxy_inc_fe_cum_sess_ver_ctr(struct listener *l, struct proxy *fe,
+ unsigned int http_ver)
+{
+ if (http_ver == 0 ||
+ http_ver > sizeof(fe->fe_counters.cum_sess_ver) / sizeof(*fe->fe_counters.cum_sess_ver))
+ return;
+
+ _HA_ATOMIC_INC(&fe->fe_counters.cum_sess_ver[http_ver - 1]);
+ if (l && l->counters)
+ _HA_ATOMIC_INC(&l->counters->cum_sess_ver[http_ver - 1]);
+}
+
+/* increase the number of cumulated connections on the designated backend */
+static inline void proxy_inc_be_ctr(struct proxy *be)
+{
+ _HA_ATOMIC_INC(&be->be_counters.cum_conn);
+ HA_ATOMIC_UPDATE_MAX(&be->be_counters.sps_max,
+ update_freq_ctr(&be->be_sess_per_sec, 1));
+}
+
+/* increase the number of cumulated requests on the designated frontend.
+ * <http_ver> must be the HTTP version for HTTP requests. 0 may be provided
+ * for other requests.
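+ * As an illustration (hypothetical call site, not from this file):
+ * proxy_inc_fe_req_ctr(l, fe, 2) accounts one HTTP/2 request on <fe> and,
+ * when present, on listener <l>'s counters.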
+ */
+static inline void proxy_inc_fe_req_ctr(struct listener *l, struct proxy *fe,
+ unsigned int http_ver)
+{
+ if (http_ver >= sizeof(fe->fe_counters.p.http.cum_req) / sizeof(*fe->fe_counters.p.http.cum_req))
+ return;
+
+ _HA_ATOMIC_INC(&fe->fe_counters.p.http.cum_req[http_ver]);
+ if (l && l->counters)
+ _HA_ATOMIC_INC(&l->counters->p.http.cum_req[http_ver]);
+ HA_ATOMIC_UPDATE_MAX(&fe->fe_counters.p.http.rps_max,
+ update_freq_ctr(&fe->fe_req_per_sec, 1));
+}
+
+/* Returns non-zero if the proxy is configured to retry a request if we got that status, 0 otherwise */
+static inline int l7_status_match(struct proxy *p, int status)
+{
+ /* Just return 0 if no retry was configured for any status */
+ if (!(p->retry_type & PR_RE_STATUS_MASK))
+ return 0;
+
+ switch (status) {
+ case 401:
+ return (p->retry_type & PR_RE_401);
+ case 403:
+ return (p->retry_type & PR_RE_403);
+ case 404:
+ return (p->retry_type & PR_RE_404);
+ case 408:
+ return (p->retry_type & PR_RE_408);
+ case 425:
+ return (p->retry_type & PR_RE_425);
+ case 500:
+ return (p->retry_type & PR_RE_500);
+ case 501:
+ return (p->retry_type & PR_RE_501);
+ case 502:
+ return (p->retry_type & PR_RE_502);
+ case 503:
+ return (p->retry_type & PR_RE_503);
+ case 504:
+ return (p->retry_type & PR_RE_504);
+ default:
+ break;
+ }
+ return 0;
+}
+
+/* Return 1 if proxy <proxy> is in <list>, the list of proxies which are also
+ * stick-tables, 0 if not.
+ */
+static inline int in_proxies_list(struct proxy *list, struct proxy *proxy)
+{
+ struct proxy *p;
+
+ for (p = list; p; p = p->next_stkt_ref)
+ if (proxy == p)
+ return 1;
+
+ return 0;
+}
+
+/* Add <bytes> to the global total bytes sent and adjust the send rate. Set
+ * <splice> if this was sent using splicing.
+ */
+static inline void increment_send_rate(uint64_t bytes, int splice)
+{
+ /* We count the total bytes sent, and the send rate for 32-byte blocks.
+ * The reason for the latter is that freq_ctr counters are limited to
+ * 4GB, which is not enough per second.
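+ * As a rough illustration (our own arithmetic, not from the source): at
+ * 10 GB/s, counting raw bytes would overflow 4GB within a second, while
+ * counting 32-byte blocks stays around 312 million events per second.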
+ */ + + if (splice) + _HA_ATOMIC_ADD(&th_ctx->spliced_out_bytes, bytes); + _HA_ATOMIC_ADD(&th_ctx->out_bytes, bytes); + update_freq_ctr(&th_ctx->out_32bps, (bytes + 16) / 32); +} + +#endif /* _HAPROXY_PROXY_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/qmux_http.h b/include/haproxy/qmux_http.h new file mode 100644 index 0000000..a7dbe7c --- /dev/null +++ b/include/haproxy/qmux_http.h @@ -0,0 +1,17 @@ +#ifndef _HAPROXY_MUX_QUIC_HTTP_H +#define _HAPROXY_MUX_QUIC_HTTP_H + +#ifdef USE_QUIC + +#include <haproxy/buf.h> +#include <haproxy/mux_quic.h> + +size_t qcs_http_rcv_buf(struct qcs *qcs, struct buffer *buf, size_t count, + char *fin); +size_t qcs_http_snd_buf(struct qcs *qcs, struct buffer *buf, size_t count, + char *fin); +size_t qcs_http_reset_buf(struct qcs *qcs, struct buffer *buf, size_t count); + +#endif /* USE_QUIC */ + +#endif /* _HAPROXY_MUX_QUIC_HTTP_H */ diff --git a/include/haproxy/qmux_trace.h b/include/haproxy/qmux_trace.h new file mode 100644 index 0000000..49759a3 --- /dev/null +++ b/include/haproxy/qmux_trace.h @@ -0,0 +1,73 @@ +#ifndef _HAPROXY_QMUX_TRACE_H +#define _HAPROXY_QMUX_TRACE_H + +#ifdef USE_QUIC + +#include <haproxy/api-t.h> +#include <haproxy/trace.h> + +extern struct trace_source trace_qmux; +#define TRACE_SOURCE &trace_qmux + +static const struct trace_event qmux_trace_events[] = { +#define QMUX_EV_QCC_NEW (1ULL << 0) + { .mask = QMUX_EV_QCC_NEW , .name = "qcc_new", .desc = "new QUIC connection" }, +#define QMUX_EV_QCC_RECV (1ULL << 1) + { .mask = QMUX_EV_QCC_RECV, .name = "qcc_recv", .desc = "Rx on QUIC connection" }, +#define QMUX_EV_QCC_SEND (1ULL << 2) + { .mask = QMUX_EV_QCC_SEND, .name = "qcc_send", .desc = "Tx on QUIC connection" }, +#define QMUX_EV_QCC_WAKE (1ULL << 3) + { .mask = QMUX_EV_QCC_WAKE, .name = "qcc_wake", .desc = "QUIC connection woken up" }, +#define QMUX_EV_QCC_END (1ULL << 4) + { .mask = QMUX_EV_QCC_END, .name = "qcc_end", .desc = "QUIC connection terminated" }, +#define QMUX_EV_QCC_NQCS (1ULL << 5) + { .mask = QMUX_EV_QCC_NQCS, .name = "qcc_no_qcs", .desc = "QUIC stream not found" }, +#define QMUX_EV_QCS_NEW (1ULL << 6) + { .mask = QMUX_EV_QCS_NEW, .name = "qcs_new", .desc = "new QUIC stream" }, +#define QMUX_EV_QCS_RECV (1ULL << 7) + { .mask = QMUX_EV_QCS_RECV, .name = "qcs_recv", .desc = "Rx on QUIC stream" }, +#define QMUX_EV_QCS_SEND (1ULL << 8) + { .mask = QMUX_EV_QCS_SEND, .name = "qcs_send", .desc = "Tx on QUIC stream" }, +#define QMUX_EV_QCS_END (1ULL << 9) + { .mask = QMUX_EV_QCS_END, .name = "qcs_end", .desc = "QUIC stream terminated" }, +#define QMUX_EV_STRM_RECV (1ULL << 10) + { .mask = QMUX_EV_STRM_RECV, .name = "strm_recv", .desc = "receiving data for stream" }, +#define QMUX_EV_STRM_SEND (1ULL << 11) + { .mask = QMUX_EV_STRM_SEND, .name = "strm_send", .desc = "sending data for stream" }, +#define QMUX_EV_STRM_WAKE (1ULL << 12) + { .mask = QMUX_EV_STRM_WAKE, .name = "strm_wake", .desc = "stream woken up" }, +#define QMUX_EV_STRM_SHUT (1ULL << 13) + { .mask = QMUX_EV_STRM_SHUT, .name = "strm_shut", .desc = "stream shutdown" }, +#define QMUX_EV_STRM_END (1ULL << 14) + { .mask = QMUX_EV_STRM_END, .name = "strm_end", .desc = "detaching app-layer stream" }, +#define QMUX_EV_SEND_FRM (1ULL << 15) + { .mask = QMUX_EV_SEND_FRM, .name = "send_frm", .desc = "sending QUIC frame" }, +/* special event dedicated to qcs_xfer_data */ +#define QMUX_EV_QCS_XFER_DATA (1ULL << 16) + { .mask = QMUX_EV_QCS_XFER_DATA, .name = "qcs_xfer_data", .desc = "qcs_xfer_data" }, 
+/* special event dedicated to qcs_build_stream_frm */
+#define QMUX_EV_QCS_BUILD_STRM (1ULL << 17)
+ { .mask = QMUX_EV_QCS_BUILD_STRM, .name = "qcs_build_stream_frm", .desc = "qcs_build_stream_frm" },
+#define QMUX_EV_PROTO_ERR (1ULL << 18)
+ { .mask = QMUX_EV_PROTO_ERR, .name = "proto_err", .desc = "protocol error" },
+#define QMUX_EV_QCC_ERR (1ULL << 19)
+ { .mask = QMUX_EV_QCC_ERR, .name = "qcc_err", .desc = "connection on error" },
+ { }
+};
+
+/* custom arg for QMUX_EV_QCS_XFER_DATA */
+struct qcs_xfer_data_trace_arg {
+ size_t prep;
+ int xfer;
+};
+
+/* custom arg for QMUX_EV_QCS_BUILD_STRM */
+struct qcs_build_stream_trace_arg {
+ size_t len;
+ char fin;
+ uint64_t offset;
+};
+
+#endif /* USE_QUIC */
+
+#endif /* _HAPROXY_QMUX_TRACE_H */
diff --git a/include/haproxy/qpack-dec.h b/include/haproxy/qpack-dec.h
new file mode 100644
index 0000000..993f450
--- /dev/null
+++ b/include/haproxy/qpack-dec.h
@@ -0,0 +1,51 @@
+/*
+ * QPACK decompressor
+ *
+ * Copyright 2021 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_QPACK_DEC_H
+#define _HAPROXY_QPACK_DEC_H
+
+struct buffer;
+struct http_hdr;
+
+/* Internal QPACK processing errors.
+ * Nothing to do with the RFC.
+ */ +enum { + QPACK_ERR_NONE = 0, /* no error */ + QPACK_ERR_RIC, /* cannot decode Required Insert Count prefix field */ + QPACK_ERR_DB, /* cannot decode Delta Base prefix field */ + QPACK_ERR_TRUNCATED, /* truncated stream */ + QPACK_ERR_HUFFMAN, /* huffman decoding error */ + QPACK_ERR_TOO_LARGE, /* decoded request/response is too large */ +}; + +struct qpack_dec { + /* Insert count */ + uint64_t ic; + /* Known received count */ + uint64_t krc; +}; + +int qpack_decode_fs(const unsigned char *buf, uint64_t len, struct buffer *tmp, + struct http_hdr *list, int list_size); +int qpack_decode_enc(struct buffer *buf, int fin, void *ctx); +int qpack_decode_dec(struct buffer *buf, int fin, void *ctx); + +#endif /* _HAPROXY_QPACK_DEC_H */ diff --git a/include/haproxy/qpack-enc.h b/include/haproxy/qpack-enc.h new file mode 100644 index 0000000..0126937 --- /dev/null +++ b/include/haproxy/qpack-enc.h @@ -0,0 +1,12 @@ +#ifndef QPACK_ENC_H_ +#define QPACK_ENC_H_ + +#include <haproxy/istbuf.h> + +struct buffer; + +int qpack_encode_field_section_line(struct buffer *out); +int qpack_encode_int_status(struct buffer *out, unsigned int status); +int qpack_encode_header(struct buffer *out, const struct ist n, const struct ist v); + +#endif /* QPACK_ENC_H_ */ diff --git a/include/haproxy/qpack-t.h b/include/haproxy/qpack-t.h new file mode 100644 index 0000000..0e1736a --- /dev/null +++ b/include/haproxy/qpack-t.h @@ -0,0 +1,47 @@ +/* + * include/haproxy/qpack-t.h + * This file contains types for QPACK + * + * Copyright 2021 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_QPACK_T_H +#define _HAPROXY_QPACK_T_H +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +/* Encoder */ +/* Instruction bitmask */ +#define QPACK_ENC_INST_BITMASK 0xf0 +/* Instructions */ +#define QPACK_ENC_INST_DUP 0x00 // Duplicate +#define QPACK_ENC_INST_SDTC_BIT 0x20 // Set Dynamic Table Capacity +#define QPACK_ENC_INST_IWLN_BIT 0x40 // Insert With Literal Name +#define QPACK_ENC_INST_IWNR_BIT 0x80 // Insert With Name Reference + +/* Decoder */ +/* Instructions bitmask */ +#define QPACK_DEC_INST_BITMASK 0xf0 +/* Instructions */ +#define QPACK_DEC_INST_ICINC 0x00 // Insert Count Increment +#define QPACK_DEC_INST_SCCL 0x40 // Stream Cancellation +#define QPACK_DEC_INST_SACK 0x80 // Section Acknowledgment + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QPACK_T_H */ diff --git a/include/haproxy/qpack-tbl-t.h b/include/haproxy/qpack-tbl-t.h new file mode 100644 index 0000000..c27c623 --- /dev/null +++ b/include/haproxy/qpack-tbl-t.h @@ -0,0 +1,65 @@ +/* + * QPACK header table management (draft-ietf-quic-qpack-20) - type definitions + * + * Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _HAPROXY_QPACK_TBL_T_H +#define _HAPROXY_QPACK_TBL_T_H + +/* + * Gcc before 3.0 needs [0] to declare a variable-size array + */ +#ifndef VAR_ARRAY +#if defined(__GNUC__) && (__GNUC__ < 3) +#define VAR_ARRAY 0 +#else +#define VAR_ARRAY +#endif +#endif + +/* One dynamic table entry descriptor */ +struct qpack_dte { + uint32_t addr; /* storage address, relative to the dte address */ + uint16_t nlen; /* header name length */ + uint16_t vlen; /* header value length */ +}; + +/* Note: the table's head plus a struct qpack_dte must be smaller than or equal to 32 + * bytes so that a single large header can always fit. Here that's 16 bytes for + * the header, plus 8 bytes per slot. + * Note that when <used> == 0, front, head, and wrap are undefined. 
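+ * As a worked example (our arithmetic, not from the source): a 4096-byte
+ * table has room for at most (4096 - 16) / 8 = 510 slot descriptors, before
+ * any name/value storage is accounted for.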
+ */ +struct qpack_dht { + uint32_t size; /* allocated table size in bytes */ + uint32_t total; /* sum of nlen + vlen in bytes */ + uint16_t front; /* slot number of the first node after the idx table */ + uint16_t wrap; /* number of allocated slots, wraps here */ + uint16_t head; /* last inserted slot number */ + uint16_t used; /* number of slots in use */ + struct qpack_dte dte[VAR_ARRAY]; /* dynamic table entries */ +}; + +/* static header table as in draft-ietf-quic-qpack-20 Appendix A. [0] unused. */ +#define QPACK_SHT_SIZE 99 + +#endif /* _HAPROXY_QPACK_TBL_T_H */ diff --git a/include/haproxy/qpack-tbl.h b/include/haproxy/qpack-tbl.h new file mode 100644 index 0000000..05f3ab4 --- /dev/null +++ b/include/haproxy/qpack-tbl.h @@ -0,0 +1,170 @@ +/* + * QPACK header table management - prototypes + * + * Copyright 2021 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#ifndef _HAPROXY_QPACK_TBL_H +#define _HAPROXY_QPACK_TBL_H + +#include <import/ist.h> +#include <haproxy/api.h> +#include <haproxy/qpack-tbl-t.h> +#include <haproxy/http-hdr-t.h> + +/* when built outside of haproxy, QPACK_STANDALONE must be defined, and + * pool_head_qpack_tbl->size must be set to the DHT size. + */ +#ifndef QPACK_STANDALONE +#include <haproxy/pool.h> +#define qpack_alloc(pool) pool_alloc(pool) +#define qpack_free(pool, ptr) pool_free(pool, ptr) +#else +#include <stdlib.h> +#include <haproxy/pool-t.h> +#define qpack_alloc(pool) malloc(pool->size) +#define qpack_free(pool, ptr) free(ptr) +#endif + +extern const struct http_hdr qpack_sht[QPACK_SHT_SIZE]; +extern struct pool_head *pool_head_qpack_tbl; + +int __qpack_dht_make_room(struct qpack_dht *dht, unsigned int needed); +int qpack_dht_insert(struct qpack_dht *dht, struct ist name, struct ist value); + +#ifdef DEBUG_QPACK +void qpack_dht_dump(FILE *out, const struct qpack_dht *dht); +void qpack_dht_check_consistency(const struct qpack_dht *dht); +#endif + +/* return a pointer to the entry designated by index <idx> (starting at 0) or + * NULL if this index is not there. 
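+ * A minimal lookup sketch (illustrative only, assuming <dht> points to a
+ * valid table):
+ * const struct qpack_dte *dte = qpack_get_dte(dht, 0);
+ * struct ist name = dte ? qpack_get_name(dht, dte) : IST_NULL;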
+ */ +static inline const struct qpack_dte *qpack_get_dte(const struct qpack_dht *dht, uint16_t idx) +{ + if (idx >= dht->used) + return NULL; + + return &dht->dte[idx]; +} + +/* returns non-zero if <idx> is valid for table <dht> */ +static inline int qpack_valid_idx(const struct qpack_dht *dht, uint32_t idx) +{ + return idx < dht->used; +} + +/* return a pointer to the header name for entry <dte>. */ +static inline struct ist qpack_get_name(const struct qpack_dht *dht, const struct qpack_dte *dte) +{ + struct ist ret = { + .ptr = (void *)dht + dte->addr, + .len = dte->nlen, + }; + return ret; +} + +/* return a pointer to the header value for entry <dte>. */ +static inline struct ist qpack_get_value(const struct qpack_dht *dht, const struct qpack_dte *dte) +{ + struct ist ret = { + .ptr = (void *)dht + dte->addr + dte->nlen, + .len = dte->vlen, + }; + return ret; +} + +/* takes an idx, returns the associated name */ +static inline struct ist qpack_idx_to_name(const struct qpack_dht *dht, uint32_t idx) +{ + const struct qpack_dte *dte; + + dte = qpack_get_dte(dht, idx); + if (!dte) + return ist("### ERR ###"); // error + + return qpack_get_name(dht, dte); +} + +/* takes an idx, returns the associated value */ +static inline struct ist qpack_idx_to_value(const struct qpack_dht *dht, uint32_t idx) +{ + const struct qpack_dte *dte; + + dte = qpack_get_dte(dht, idx); + if (!dte) + return ist("### ERR ###"); // error + + return qpack_get_value(dht, dte); +} + +/* returns the slot number of the oldest entry (tail). Must not be used on an + * empty table. + */ +static inline unsigned int qpack_dht_get_tail(const struct qpack_dht *dht) +{ + return ((dht->head + 1U < dht->used) ? dht->wrap : 0) + dht->head + 1U - dht->used; +} + +/* Purges table dht until a header field of <needed> bytes fits according to + * the protocol (adding 32 bytes overhead). Returns non-zero on success, zero + * on failure (ie: table empty but still not sufficient). + */ +static inline int qpack_dht_make_room(struct qpack_dht *dht, unsigned int needed) +{ + if (dht->used * 32 + dht->total + needed + 32 <= dht->size) + return 1; + else if (!dht->used) + return 0; + + return __qpack_dht_make_room(dht, needed); +} + +/* allocate a dynamic headers table of <size> bytes and return it initialized */ +static inline void qpack_dht_init(struct qpack_dht *dht, uint32_t size) +{ + dht->size = size; + dht->total = 0; + dht->used = 0; +} + +/* allocate a dynamic headers table from the pool and return it initialized */ +static inline struct qpack_dht *qpack_dht_alloc() +{ + struct qpack_dht *dht; + + if (unlikely(!pool_head_qpack_tbl)) + return NULL; + + dht = qpack_alloc(pool_head_qpack_tbl); + if (dht) + qpack_dht_init(dht, pool_head_qpack_tbl->size); + return dht; +} + +/* free a dynamic headers table */ +static inline void qpack_dht_free(struct qpack_dht *dht) +{ + qpack_free(pool_head_qpack_tbl, dht); +} + +#endif /* _HAPROXY_QPACK_TBL_H */ diff --git a/include/haproxy/queue-t.h b/include/haproxy/queue-t.h new file mode 100644 index 0000000..8f6a1ec --- /dev/null +++ b/include/haproxy/queue-t.h @@ -0,0 +1,59 @@ +/* + * include/haproxy/queue-t.h + * This file defines variables and structures needed for queues. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. 
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_QUEUE_T_H
+#define _HAPROXY_QUEUE_T_H
+
+#include <import/ebtree-t.h>
+#include <haproxy/api-t.h>
+
+struct proxy;
+struct server;
+struct stream;
+struct queue;
+
+struct pendconn {
+ int strm_flags; /* stream flags */
+ unsigned int queue_idx; /* value of proxy/server queue_idx at time of enqueue */
+ struct stream *strm;
+ struct queue *queue; /* the queue the entry is queued into */
+ struct server *target; /* the server that was assigned, = srv except if srv==NULL */
+ struct eb32_node node;
+ __decl_thread(HA_SPINLOCK_T del_lock); /* use before removal, always under queue's lock */
+};
+
+struct queue {
+ struct eb_root head; /* queued pendconns */
+ struct proxy *px; /* the proxy we're waiting for, never NULL in queue */
+ struct server *sv; /* the server we are waiting for, may be NULL if we don't care */
+ __decl_thread(HA_SPINLOCK_T lock); /* for manipulations in the tree */
+ unsigned int idx; /* current queuing index */
+ unsigned int length; /* number of entries */
+};
+
+#endif /* _HAPROXY_QUEUE_T_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/queue.h b/include/haproxy/queue.h
new file mode 100644
index 0000000..e77370c
--- /dev/null
+++ b/include/haproxy/queue.h
@@ -0,0 +1,134 @@
+/*
+ * include/haproxy/queue.h
+ * This file defines everything related to queues.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_QUEUE_H
+#define _HAPROXY_QUEUE_H
+
+#include <haproxy/api.h>
+#include <haproxy/backend.h>
+#include <haproxy/pool.h>
+#include <haproxy/proxy-t.h>
+#include <haproxy/queue-t.h>
+#include <haproxy/server-t.h>
+#include <haproxy/stream-t.h>
+
+extern struct pool_head *pool_head_pendconn;
+
+struct pendconn *pendconn_add(struct stream *strm);
+int pendconn_dequeue(struct stream *strm);
+void process_srv_queue(struct server *s);
+unsigned int srv_dynamic_maxconn(const struct server *s);
+int pendconn_redistribute(struct server *s);
+int pendconn_grab_from_px(struct server *s);
+void pendconn_unlink(struct pendconn *p);
+
+/* Removes the pendconn from the server/proxy queue. It supports being called
+ * with NULL for pendconn and with a pendconn not in the list. It is the
+ * function to be used by default when unsure. Do not call it with server
+ * or proxy locks held however.
Warning: this is called from stream_free()
+ * which may run concurrently with pendconn_process_next_strm() which can be
+ * dequeuing the entry. The function must not return until the pendconn is
+ * guaranteed not to be known, which means that we must check its presence
+ * in the tree under the queue's lock so that pendconn_process_next_strm()
+ * finishes before we return in case it would have grabbed this pendconn. See
+ * github bugs #880 and #908, and the commit log for this fix for more details.
+ */
+static inline void pendconn_cond_unlink(struct pendconn *p)
+{
+ if (p)
+ pendconn_unlink(p);
+}
+
+/* Releases the pendconn associated to stream <s> if it has any, and decreases
+ * the pending count if needed. The connection might have been queued to a
+ * specific server as well as to the proxy. The stream also gets marked
+ * unqueued.
+ *
+ * This function must be called by the stream itself, so in the context of
+ * process_stream, without any lock held among the pendconn, the server's queue
+ * nor the proxy's queue.
+ */
+static inline void pendconn_free(struct stream *s)
+{
+ struct pendconn *p = s->pend_pos;
+
+ if (p) {
+ pendconn_cond_unlink(p);
+ s->pend_pos = NULL;
+ pool_free(pool_head_pendconn, p);
+ }
+}
+
+/* Returns 0 if all slots are full on a server, or 1 if there are slots available. */
+static inline int server_has_room(const struct server *s) {
+ return !s->maxconn || s->cur_sess < srv_dynamic_maxconn(s);
+}
+
+/* returns 0 if nothing has to be done for server <s> regarding queued connections,
+ * and non-zero otherwise. If the server is down, we only check its own queue. Suited
+ * for an if/else usage.
+ */
+static inline int may_dequeue_tasks(const struct server *s, const struct proxy *p) {
+ return (s && (s->queue.length || (p->queue.length && srv_currently_usable(s))) &&
+ (!s->maxconn || s->cur_sess < srv_dynamic_maxconn(s)));
+}
+
+static inline int queue_limit_class(int class)
+{
+ if (class < -0x7ff)
+ return -0x7ff;
+ if (class > 0x7ff)
+ return 0x7ff;
+ return class;
+}
+
+static inline int queue_limit_offset(int offset)
+{
+ if (offset < -0x7ffff)
+ return -0x7ffff;
+ if (offset > 0x7ffff)
+ return 0x7ffff;
+ return offset;
+}
+
+/* initialize the queue <queue> for proxy <px> and server <sv>. A server's
+ * queue always has both a valid proxy and a valid server. A proxy's queue
+ * only has a valid proxy and NULL for the server. This is how they're
+ * distinguished during operations.
+ */
+static inline void queue_init(struct queue *queue, struct proxy *px, struct server *sv)
+{
+ queue->head = EB_ROOT;
+ queue->length = 0;
+ queue->idx = 0;
+ queue->px = px;
+ queue->sv = sv;
+ HA_SPIN_INIT(&queue->lock);
+}
+
+#endif /* _HAPROXY_QUEUE_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/quic_ack-t.h b/include/haproxy/quic_ack-t.h
new file mode 100644
index 0000000..95b77f1
--- /dev/null
+++ b/include/haproxy/quic_ack-t.h
@@ -0,0 +1,43 @@
+/*
+ * include/haproxy/quic_ack-t.h
+ * Definitions for QUIC acknowledgements internal types, constants and flags.
+ *
+ * Copyright (C) 2023
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+#ifndef _HAPROXY_QUIC_ACK_T_H
+#define _HAPROXY_QUIC_ACK_T_H
+
+/* The maximum number of ack ranges to be built in ACK frames */
+#define QUIC_MAX_ACK_RANGES 32
+
+/* Structure to maintain a set of ACK ranges to be used to build ACK frames. */
+struct quic_arngs {
+ /* ebtree of ACK ranges organized by their first value. */
+ struct eb_root root;
+ /* The number of ACK ranges in this tree */
+ size_t sz;
+ /* The number of bytes required to encode this list of ACK ranges. */
+ size_t enc_sz;
+};
+
+/* Structure to hold a range of ACKs sent in ACK frames. */
+struct quic_arng {
+ int64_t first;
+ int64_t last;
+};
+
+/* Structure to hold a range of ACKs to be stored as a node in a tree of
+ * ACK ranges.
+ */
+struct quic_arng_node {
+ struct eb64_node first;
+ uint64_t last;
+};
+
+#endif /* _HAPROXY_QUIC_ACK_T_H */
diff --git a/include/haproxy/quic_ack.h b/include/haproxy/quic_ack.h
new file mode 100644
index 0000000..540e2c0
--- /dev/null
+++ b/include/haproxy/quic_ack.h
@@ -0,0 +1,23 @@
+/*
+ * include/haproxy/quic_ack.h
+ * This file provides definitions for QUIC acknowledgements.
+ *
+ * Copyright (C) 2023
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _HAPROXY_QUIC_ACK_H
+#define _HAPROXY_QUIC_ACK_H
+
+void quic_free_arngs(struct quic_conn *qc, struct quic_arngs *arngs);
+int quic_update_ack_ranges_list(struct quic_conn *qc,
+ struct quic_arngs *arngs,
+ struct quic_arng *ar);
+void qc_treat_ack_of_ack(struct quic_conn *qc, struct quic_arngs *arngs,
+ int64_t largest_acked_pn);
+
+#endif /* _HAPROXY_QUIC_ACK_H */
diff --git a/include/haproxy/quic_cc-t.h b/include/haproxy/quic_cc-t.h
new file mode 100644
index 0000000..888efca
--- /dev/null
+++ b/include/haproxy/quic_cc-t.h
@@ -0,0 +1,123 @@
+/*
+ * include/haproxy/quic_cc-t.h
+ * This file contains definitions for QUIC congestion control.
+ *
+ * Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_QUIC_CC_H
+#define _HAPROXY_QUIC_CC_H
+#ifdef USE_QUIC
+#ifndef USE_OPENSSL
+#error "Must define USE_OPENSSL"
+#endif
+
+#include <inttypes.h>
+#include <stddef.h> /* size_t */
+
+#include <haproxy/buf-t.h>
+#include <haproxy/quic_loss-t.h>
+
+#define QUIC_CC_INFINITE_SSTHESH ((uint32_t)-1)
+
+extern struct quic_cc_algo quic_cc_algo_nr;
+extern struct quic_cc_algo quic_cc_algo_cubic;
+extern struct quic_cc_algo *default_quic_cc_algo;
+
+/* Fake algorithm with its fixed window */
+extern struct quic_cc_algo quic_cc_algo_nocc;
+
+extern unsigned long long last_ts;
+
+enum quic_cc_algo_state_type {
+ /* Slow start. */
+ QUIC_CC_ST_SS,
+ /* Congestion avoidance.
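+ * (general QUIC note, not specific to this header: the window
+ * typically grows by about one datagram per RTT in this state)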
*/ + QUIC_CC_ST_CA, + /* Recovery period. */ + QUIC_CC_ST_RP, +}; + +enum quic_cc_event_type { + /* ACK receipt. */ + QUIC_CC_EVT_ACK, + /* Packet loss. */ + QUIC_CC_EVT_LOSS, + /* ECN-CE. */ + QUIC_CC_EVT_ECN_CE, +}; + +struct quic_cc_event { + enum quic_cc_event_type type; + union { + struct ack { + uint64_t acked; + unsigned int time_sent; + } ack; + struct loss { + unsigned int time_sent; + } loss; + }; +}; + +enum quic_cc_algo_type { + QUIC_CC_ALGO_TP_NEWRENO, + QUIC_CC_ALGO_TP_CUBIC, + QUIC_CC_ALGO_TP_NOCC, +}; + +struct quic_cc { + /* <conn> is there only for debugging purpose. */ + struct quic_conn *qc; + struct quic_cc_algo *algo; + uint32_t priv[16]; +}; + +struct quic_cc_path { + /* Control congestion. */ + struct quic_cc cc; + /* Packet loss detection information. */ + struct quic_loss loss; + + /* MTU. */ + size_t mtu; + /* Congestion window. */ + uint64_t cwnd; + /* The current maximum congestion window value reached. */ + uint64_t mcwnd; + /* The maximum congestion window value which can be reached. */ + uint64_t max_cwnd; + /* Minimum congestion window. */ + uint64_t min_cwnd; + /* Prepared data to be sent (in bytes). */ + uint64_t prep_in_flight; + /* Outstanding data (in bytes). */ + uint64_t in_flight; + /* Number of in flight ack-eliciting packets. */ + uint64_t ifae_pkts; +}; + +struct quic_cc_algo { + enum quic_cc_algo_type type; + int (*init)(struct quic_cc *cc); + void (*event)(struct quic_cc *cc, struct quic_cc_event *ev); + void (*slow_start)(struct quic_cc *cc); + void (*state_trace)(struct buffer *buf, const struct quic_cc *cc); +}; + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_CC_H */ diff --git a/include/haproxy/quic_cc.h b/include/haproxy/quic_cc.h new file mode 100644 index 0000000..721feca --- /dev/null +++ b/include/haproxy/quic_cc.h @@ -0,0 +1,112 @@ +/* + * include/proto/quic_cc.h + * This file contains prototypes for QUIC congestion control. + * + * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _PROTO_QUIC_CC_H
+#define _PROTO_QUIC_CC_H
+#ifdef USE_QUIC
+#ifndef USE_OPENSSL
+#error "Must define USE_OPENSSL"
+#endif
+
+#include <haproxy/api.h>
+#include <haproxy/buf.h>
+#include <haproxy/chunk.h>
+#include <haproxy/quic_cc-t.h>
+#include <haproxy/quic_conn-t.h>
+#include <haproxy/quic_loss.h>
+
+void quic_cc_init(struct quic_cc *cc, struct quic_cc_algo *algo, struct quic_conn *qc);
+void quic_cc_event(struct quic_cc *cc, struct quic_cc_event *ev);
+void quic_cc_state_trace(struct buffer *buf, const struct quic_cc *cc);
+
+static inline const char *quic_cc_state_str(enum quic_cc_algo_state_type state)
+{
+ switch (state) {
+ case QUIC_CC_ST_SS:
+ return "ss";
+ case QUIC_CC_ST_CA:
+ return "ca";
+ case QUIC_CC_ST_RP:
+ return "rp";
+ default:
+ return "unknown";
+ }
+}
+
+/* Dump into <buf> a human readable representation of <ev> congestion control
+ * event type.
+ */
+static inline void quic_cc_event_trace(struct buffer *buf, const struct quic_cc_event *ev)
+{
+ chunk_appendf(buf, " event=");
+ switch (ev->type) {
+ case QUIC_CC_EVT_ACK:
+ chunk_appendf(buf, "ack acked=%llu time_sent:%dms",
+ (unsigned long long)ev->ack.acked, TICKS_TO_MS(tick_remain(ev->ack.time_sent, now_ms)));
+ break;
+ case QUIC_CC_EVT_LOSS:
+ chunk_appendf(buf, "loss time_sent=%dms", TICKS_TO_MS(tick_remain(ev->loss.time_sent, now_ms)));
+ break;
+ case QUIC_CC_EVT_ECN_CE:
+ chunk_appendf(buf, "ecn_ce");
+ break;
+ }
+}
+
+static inline void *quic_cc_priv(const struct quic_cc *cc)
+{
+ return (void *)cc->priv;
+}
+
+/* Initialize <path> QUIC network path depending on <ipv4> boolean,
+ * which is true for an IPv4 path, false for an IPv6 path.
+ */
+static inline void quic_cc_path_init(struct quic_cc_path *path, int ipv4, unsigned long max_cwnd,
+ struct quic_cc_algo *algo, struct quic_conn *qc)
+{
+ unsigned int max_dgram_sz;
+
+ max_dgram_sz = ipv4 ? QUIC_INITIAL_IPV4_MTU : QUIC_INITIAL_IPV6_MTU;
+ quic_loss_init(&path->loss);
+ path->mtu = max_dgram_sz;
+ path->cwnd = QUIC_MIN(10 * max_dgram_sz, QUIC_MAX(max_dgram_sz << 1, 14720U));
+ path->mcwnd = path->cwnd;
+ path->max_cwnd = max_cwnd;
+ path->min_cwnd = max_dgram_sz << 1;
+ path->prep_in_flight = 0;
+ path->in_flight = 0;
+ path->ifae_pkts = 0;
+ quic_cc_init(&path->cc, algo, qc);
+}
+
+/* Return the remaining room available on <path> QUIC path for prepared data
+ * (before being sent). Almost the same as the QUIC path room, except that
+ * the data which have been prepared are taken into account here.
+ */
+static inline size_t quic_cc_path_prep_data(struct quic_cc_path *path)
+{
+ if (path->prep_in_flight > path->cwnd)
+ return 0;
+
+ return path->cwnd - path->prep_in_flight;
+}
+
+
+#endif /* USE_QUIC */
+#endif /* _PROTO_QUIC_CC_H */
diff --git a/include/haproxy/quic_cid-t.h b/include/haproxy/quic_cid-t.h
new file mode 100644
index 0000000..ccce844
--- /dev/null
+++ b/include/haproxy/quic_cid-t.h
@@ -0,0 +1,38 @@
+#ifndef _HAPROXY_QUIC_CID_T_H
+#define _HAPROXY_QUIC_CID_T_H
+
+#include <import/ebtree-t.h>
+#include <haproxy/quic_tp-t.h>
+
+/* QUIC connection ID maximum length for version 1. */
+#define QUIC_CID_MAXLEN 20 /* bytes */
+
+/* QUIC connection id data.
+ *
+ * This struct is used by ebmb_node structs as last member of flexible arrays.
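+ * (the raw CID bytes can then serve directly as the ebmb lookup key)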
+ * So do not change the order of the member of quic_cid struct. + * <data> member must be the first one. + */ +struct quic_cid { + unsigned char data[QUIC_CID_MAXLEN]; + unsigned char len; /* size of QUIC CID */ +}; + +/* QUIC connection id attached to a QUIC connection. + * + * This structure is used to match received packets DCIDs with the + * corresponding QUIC connection. + */ +struct quic_connection_id { + struct eb64_node seq_num; + uint64_t retire_prior_to; + unsigned char stateless_reset_token[QUIC_STATELESS_RESET_TOKEN_LEN]; + + struct ebmb_node node; /* node for receiver tree, cid.data as key */ + struct quic_cid cid; /* CID data */ + + struct quic_conn *qc; /* QUIC connection using this CID */ + uint tid; /* Attached Thread ID for the connection. */ +}; + +#endif /* _HAPROXY_QUIC_CID_T_H */ diff --git a/include/haproxy/quic_cid.h b/include/haproxy/quic_cid.h new file mode 100644 index 0000000..482a020 --- /dev/null +++ b/include/haproxy/quic_cid.h @@ -0,0 +1,110 @@ +#ifndef _HAPROXY_QUIC_CID_H +#define _HAPROXY_QUIC_CID_H + +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <import/ebmbtree.h> + +#include <haproxy/buf-t.h> +#include <haproxy/chunk.h> +#include <haproxy/quic_conn-t.h> +#include <haproxy/quic_rx-t.h> +#include <haproxy/proto_quic.h> + +struct quic_connection_id *new_quic_cid(struct eb_root *root, + struct quic_conn *qc, + const struct quic_cid *orig, + const struct sockaddr_storage *addr); +int quic_get_cid_tid(const unsigned char *cid, size_t cid_len, + const struct sockaddr_storage *cli_addr, + unsigned char *pos, size_t len); +struct quic_cid quic_derive_cid(const struct quic_cid *orig, + const struct sockaddr_storage *addr); +struct quic_conn *retrieve_qc_conn_from_cid(struct quic_rx_packet *pkt, + struct sockaddr_storage *saddr, + int *new_tid); +int qc_build_new_connection_id_frm(struct quic_conn *qc, + struct quic_connection_id *conn_id); + +/* Copy <src> QUIC CID to <dst>. + * This is the responsibility of the caller to check there is enough room in + * <dst> to copy <src>. + * Always succeeds. + */ +static inline void quic_cid_cpy(struct quic_cid *dst, const struct quic_cid *src) +{ + memcpy(dst->data, src->data, src->len); + dst->len = src->len; +} + +/* Dump the QUIC connection ID value if present (non null length). Used only for + * debugging purposes. + * Always succeeds. + */ +static inline void quic_cid_dump(struct buffer *buf, + const struct quic_cid *cid) +{ + int i; + + chunk_appendf(buf, "(%d", cid->len); + if (cid->len) + chunk_appendf(buf, ","); + for (i = 0; i < cid->len; i++) + chunk_appendf(buf, "%02x", cid->data[i]); + chunk_appendf(buf, ")"); +} + +/* Return tree index where <cid> is stored. */ +static inline uchar _quic_cid_tree_idx(const unsigned char *cid) +{ + return cid[0]; +} + +/* Return tree index where <cid> is stored. */ +static inline uchar quic_cid_tree_idx(const struct quic_cid *cid) +{ + return _quic_cid_tree_idx(cid->data); +} + +/* Insert <conn_id> into global CID tree as a thread-safe operation. */ +static inline void quic_cid_insert(struct quic_connection_id *conn_id) +{ + const uchar idx = quic_cid_tree_idx(&conn_id->cid); + struct quic_cid_tree *tree = &quic_cid_trees[idx]; + + HA_RWLOCK_WRLOCK(QC_CID_LOCK, &tree->lock); + ebmb_insert(&tree->root, &conn_id->node, conn_id->cid.len); + HA_RWLOCK_WRUNLOCK(QC_CID_LOCK, &tree->lock); +} + +/* Remove <conn_id> from global CID tree as a thread-safe operation. 
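+ * Like quic_cid_insert() above, this takes the tree's write lock, so it must
+ * not be called with that lock already held.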
*/
+static inline void quic_cid_delete(struct quic_connection_id *conn_id)
+{
+ const uchar idx = quic_cid_tree_idx(&conn_id->cid);
+ struct quic_cid_tree __maybe_unused *tree = &quic_cid_trees[idx];
+
+ HA_RWLOCK_WRLOCK(QC_CID_LOCK, &tree->lock);
+ ebmb_delete(&conn_id->node);
+ HA_RWLOCK_WRUNLOCK(QC_CID_LOCK, &tree->lock);
+}
+
+/* Copy <src> new connection ID information to <dst> NEW_CONNECTION_ID frame.
+ * Always succeeds.
+ */
+static inline void quic_connection_id_to_frm_cpy(struct quic_frame *dst,
+ struct quic_connection_id *src)
+{
+ struct qf_new_connection_id *ncid_frm = &dst->new_connection_id;
+
+ ncid_frm->seq_num = src->seq_num.key;
+ ncid_frm->retire_prior_to = src->retire_prior_to;
+ ncid_frm->cid.len = src->cid.len;
+ ncid_frm->cid.data = src->cid.data;
+ ncid_frm->stateless_reset_token = src->stateless_reset_token;
+}
+
+#endif /* USE_QUIC */
+#endif /* _HAPROXY_QUIC_CID_H */
diff --git a/include/haproxy/quic_cli-t.h b/include/haproxy/quic_cli-t.h
new file mode 100644
index 0000000..6f95899
--- /dev/null
+++ b/include/haproxy/quic_cli-t.h
@@ -0,0 +1,18 @@
+/*
+ * include/haproxy/quic_cli-t.h
+ * Definitions for QUIC CLI internal types, constants and flags.
+ *
+ * Copyright (C) 2023
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+#ifndef _HAPROXY_QUIC_CLI_T_H
+#define _HAPROXY_QUIC_CLI_T_H
+
+extern unsigned int qc_epoch;
+
+#endif /* _HAPROXY_QUIC_CLI_T_H */
diff --git a/include/haproxy/quic_conn-t.h b/include/haproxy/quic_conn-t.h
new file mode 100644
index 0000000..8aec6f0
--- /dev/null
+++ b/include/haproxy/quic_conn-t.h
@@ -0,0 +1,446 @@
+/*
+ * include/haproxy/quic_conn-t.h
+ *
+ * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_QUIC_CONN_T_H
+#define _HAPROXY_QUIC_CONN_T_H
+
+#ifdef USE_QUIC
+#ifndef USE_OPENSSL
+#error "Must define USE_OPENSSL"
+#endif
+
+#include <sys/socket.h>
+
+#include <haproxy/cbuf-t.h>
+#include <haproxy/list.h>
+
+#include <haproxy/openssl-compat.h>
+#include <haproxy/mux_quic-t.h>
+#include <haproxy/quic_cid-t.h>
+#include <haproxy/quic_cc-t.h>
+#include <haproxy/quic_loss-t.h>
+#include <haproxy/quic_openssl_compat-t.h>
+#include <haproxy/quic_stats-t.h>
+#include <haproxy/quic_tls-t.h>
+#include <haproxy/quic_tp-t.h>
+#include <haproxy/task.h>
+
+#include <import/ebtree-t.h>
+
+typedef unsigned long long ull;
+
+#define QUIC_PROTOCOL_VERSION_DRAFT_29   0xff00001d /* draft-29 */
+#define QUIC_PROTOCOL_VERSION_1          0x00000001 /* V1 */
+#define QUIC_PROTOCOL_VERSION_2          0x6b3343cf /* V2 */
+
+#define QUIC_INITIAL_IPV4_MTU      1252 /* (bytes) */
+#define QUIC_INITIAL_IPV6_MTU      1232
+
+/* The minimum length of Initial packets. */
+#define QUIC_INITIAL_PACKET_MINLEN 1200
+
+/* Length of the QUIC CIDs generated by the haproxy implementation. The
+ * current value is used to match the 64-bit hash produced when deriving the
+ * ODCID.
+ */
+#define QUIC_HAP_CID_LEN         8
+
+/* Common definitions for short and long QUIC packet headers. */
+/* QUIC original destination connection ID minimal length */
+#define QUIC_ODCID_MINLEN        8 /* bytes */
+/*
+ * All QUIC packets with long headers are made of at least (in bytes):
+ * flags(1), version(4), DCID length(1), DCID(0..20), SCID length(1), SCID(0..20)
+ */
+#define QUIC_LONG_PACKET_MINLEN  7
+/* DCID offset from beginning of a long packet */
+#define QUIC_LONG_PACKET_DCID_OFF (1 + sizeof(uint32_t))
+/*
+ * All QUIC packets with short headers are made of at least (in bytes):
+ * flags(1), DCID(0..20)
+ */
+#define QUIC_SHORT_PACKET_MINLEN 1
+/* DCID offset from beginning of a short packet */
+#define QUIC_SHORT_PACKET_DCID_OFF 1
+
+/* Byte 0 of QUIC packets. */
+#define QUIC_PACKET_LONG_HEADER_BIT 0x80 /* Long header format if set, short if not. */
+#define QUIC_PACKET_FIXED_BIT       0x40 /* Must always be set for all the headers. */
+
+/* Tokens formats */
+/* Format for Retry tokens sent by a QUIC server */
+#define QUIC_TOKEN_FMT_RETRY 0x9c
+/* Format for tokens sent for new connections after a Retry token was sent */
+#define QUIC_TOKEN_FMT_NEW   0xb7
+/* Retry token duration */
+#define QUIC_RETRY_DURATION_SEC    10
+/* Default Retry threshold */
+#define QUIC_DFLT_RETRY_THRESHOLD  100 /* in connection openings */
+/* Default ratio value applied to a dynamic Packet reorder threshold. */
+#define QUIC_DFLT_REORDER_RATIO    50 /* in percent */
+/* Default limit of loss detection on a single frame. If exceeded, connection is closed. */
+#define QUIC_DFLT_MAX_FRAME_LOSS   10
+
+/*
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+
+ * |1|1|T|T|X|X|X|X|
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                         Version (32)                          |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * | DCID Len (8)  |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |               Destination Connection ID (0..160)            ...
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | SCID Len (8) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Source Connection ID (0..160) ... + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * Long Header Packet Format + */ + +/* Two bits (T) for QUIC packet types. */ +#define QUIC_PACKET_TYPE_BITMASK 0x03 +#define QUIC_PACKET_TYPE_SHIFT 4 + +enum quic_pkt_type { + QUIC_PACKET_TYPE_INITIAL, + QUIC_PACKET_TYPE_0RTT, + QUIC_PACKET_TYPE_HANDSHAKE, + QUIC_PACKET_TYPE_RETRY, + /* + * The following one is not defined by the RFC but we define it for our + * own convenience. + */ + QUIC_PACKET_TYPE_SHORT, + + /* unknown type */ + QUIC_PACKET_TYPE_UNKNOWN +}; + +/* Packet number field length. */ +#define QUIC_PACKET_PNL_BITMASK 0x03 +#define QUIC_PACKET_PN_MAXLEN 4 + +/* + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+ + * |0|1|S|R|R|K|P|P| + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Destination Connection ID (0..160) ... + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Packet Number (8/16/24/32) ... + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Protected Payload (*) ... + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * Short Header Packet Format + */ + +/* Bit (S) of short header. */ +#define QUIC_PACKET_SPIN_BIT 0x20 + +/* Reserved Bits (R): The next two bits of byte 0 are reserved. + * These bits are protected using header protection + * (see Section 5.4 of [QUIC-TLS]). The value included + * prior to protection MUST be set to 0. An endpoint MUST treat + * receipt of a packet that has a non-zero value for these bits, + * after removing both packet and header protection, as a connection + * error of type PROTOCOL_VIOLATION. Discarding such a packet after + * only removing header protection can expose the endpoint to attacks + * (see Section 9.3 of [QUIC-TLS]). + */ +#define QUIC_PACKET_RESERVED_BITS 0x18 /* (protected) */ + +#define QUIC_PACKET_KEY_PHASE_BIT 0x04 /* (protected) */ + +/* The maximum number of QUIC packets stored by the fd I/O handler by QUIC + * connection. Must be a power of two. + */ +#define QUIC_CONN_MAX_PACKET 64 + +#define QUIC_STATELESS_RESET_PACKET_HEADER_LEN 5 +#define QUIC_STATELESS_RESET_PACKET_MINLEN (22 + QUIC_HAP_CID_LEN) + +/* Similar to kernel min()/max() definitions. */ +#define QUIC_MIN(a, b) ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + (void) (&_a == &_b); \ + _a < _b ? _a : _b; }) + +#define QUIC_MAX(a, b) ({ \ + typeof(a) _a = (a); \ + typeof(b) _b = (b); \ + (void) (&_a == &_b); \ + _a > _b ? _a : _b; }) + +/* Size of the QUIC RX buffer for the connections */ +#define QUIC_CONN_RX_BUFSZ (1UL << 16) + +struct quic_version { + uint32_t num; + const unsigned char *initial_salt; + size_t initial_salt_len; + const unsigned char *key_label; + size_t key_label_len; + const unsigned char *iv_label; + size_t iv_label_len; + const unsigned char *hp_label; + size_t hp_label_len; + const unsigned char *ku_label; + size_t ku_label_len; + /* Retry tag */ + const unsigned char *retry_tag_key; + const unsigned char *retry_tag_nonce; +}; + +extern const struct quic_version quic_versions[]; +extern const size_t quic_versions_nb; +extern const struct quic_version *preferred_version; + +/* unused: 0x01 */ +/* Flag the packet number space as requiring an ACK frame to be sent. 
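+ * An ack-eliciting packet was received and an ACK frame must be emitted for
+ * this packet number space. A TX path would typically test it as
+ * (pktns->flags & QUIC_FL_PKTNS_ACK_REQUIRED) before building an ACK frame
+ * (illustrative sketch only; the actual call sites live in the src/quic_*.c
+ * files).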
*/
+#define QUIC_FL_PKTNS_ACK_REQUIRED   (1UL << 1)
+/* Flag the packet number space as needing probing */
+#define QUIC_FL_PKTNS_PROBE_NEEDED   (1UL << 2)
+/* Flag the packet number space as having received a packet with a new largest
+ * packet number, to be acknowledged
+ */
+#define QUIC_FL_PKTNS_NEW_LARGEST_PN (1UL << 3)
+
+/* The maximum number of dgrams which may be sent upon PTO expirations. */
+#define QUIC_MAX_NB_PTO_DGRAMS         2
+
+/* The QUIC packet numbers are 62-bit integers */
+#define QUIC_MAX_PACKET_NUM      ((1ULL << 62) - 1)
+
+/* The maximum number of bytes of CRYPTO data in flight during handshakes. */
+#define QUIC_CRYPTO_IN_FLIGHT_MAX      4096
+
+/* Status of the connection/mux layer. This defines how to handle app data.
+ *
+ * During a standard quic_conn lifetime it transitions like this :
+ * QC_MUX_NULL -> QC_MUX_READY -> QC_MUX_RELEASED
+ */
+enum qc_mux_state {
+	QC_MUX_NULL,     /* not allocated, data should be buffered */
+	QC_MUX_READY,    /* allocated, ready to handle data */
+	QC_MUX_RELEASED, /* released, data can be dropped */
+};
+
+/* Counters at QUIC connection level */
+struct quic_conn_cntrs {
+	long long dropped_pkt;            /* total number of dropped packets */
+	long long dropped_pkt_bufoverrun; /* total number of dropped packets because of buffer overrun */
+	long long dropped_parsing;        /* total number of dropped packets upon parsing errors */
+	long long socket_full;            /* total number of EAGAIN errors on sendto() calls */
+	long long sendto_err;             /* total number of errors on sendto() calls, EAGAIN excepted */
+	long long sendto_err_unknown;     /* total number of errors on sendto() calls which are currently not supported */
+	long long sent_pkt;               /* total number of sent packets */
+	long long lost_pkt;               /* total number of lost packets */
+	long long conn_migration_done;    /* total number of connection migrations handled */
+	/* Streams related counters */
+	long long data_blocked;           /* total number of times DATA_BLOCKED frame was received */
+	long long stream_data_blocked;    /* total number of times STREAM_DATA_BLOCKED frame was received */
+	long long streams_blocked_bidi;   /* total number of times STREAMS_BLOCKED_BIDI frame was received */
+	long long streams_blocked_uni;    /* total number of times STREAMS_BLOCKED_UNI frame was received */
+};
+
+/* Flags at connection level */
+#define QUIC_FL_CONN_ANTI_AMPLIFICATION_REACHED      (1U << 0)
+#define QUIC_FL_CONN_SPIN_BIT                        (1U << 1) /* Spin bit set by remote peer */
+#define QUIC_FL_CONN_NEED_POST_HANDSHAKE_FRMS        (1U << 2) /* HANDSHAKE_DONE must be sent */
+#define QUIC_FL_CONN_LISTENER                        (1U << 3)
+#define QUIC_FL_CONN_ACCEPT_REGISTERED               (1U << 4)
+#define QUIC_FL_CONN_TX_MUX_CONTEXT                  (1U << 5) /* sending in progress from the MUX layer */
+#define QUIC_FL_CONN_IDLE_TIMER_RESTARTED_AFTER_READ (1U << 6)
+#define QUIC_FL_CONN_RETRANS_NEEDED                  (1U << 7)
+#define QUIC_FL_CONN_RETRANS_OLD_DATA                (1U << 8) /* retransmission in progress for probing with already sent data */
+#define QUIC_FL_CONN_TLS_ALERT                       (1U << 9)
+#define QUIC_FL_CONN_AFFINITY_CHANGED                (1U << 10) /* qc_finalize_affinity_rebind() must be called to finalize affinity rebind */
+/* gap here */
+#define QUIC_FL_CONN_HALF_OPEN_CNT_DECREMENTED       (1U << 11) /* The half-open connection counter was decremented */
+#define QUIC_FL_CONN_HANDSHAKE_SPEED_UP              (1U << 12) /* Handshake speeding up was done */
+#define QUIC_FL_CONN_ACK_TIMER_FIRED                 (1U << 13) /* idle timer triggered for acknowledgements */
+#define QUIC_FL_CONN_IO_TO_REQUEUE                   (1U << 14) /* IO handler must be requeued on new thread after connection
migration */ +#define QUIC_FL_CONN_IPKTNS_DCD (1U << 15) /* Initial packet number space discarded */ +#define QUIC_FL_CONN_HPKTNS_DCD (1U << 16) /* Handshake packet number space discarded */ +#define QUIC_FL_CONN_PEER_VALIDATED_ADDR (1U << 17) /* Peer address is considered as validated for this connection. */ +#define QUIC_FL_CONN_TO_KILL (1U << 24) /* Unusable connection, to be killed */ +#define QUIC_FL_CONN_TX_TP_RECEIVED (1U << 25) /* Peer transport parameters have been received (used for the transmitting part) */ +#define QUIC_FL_CONN_FINALIZED (1U << 26) /* QUIC connection finalized (functional, ready to send/receive) */ +/* gap here */ +#define QUIC_FL_CONN_EXP_TIMER (1U << 28) /* timer has expired, quic-conn can be freed */ +#define QUIC_FL_CONN_CLOSING (1U << 29) /* closing state, entered on CONNECTION_CLOSE emission */ +#define QUIC_FL_CONN_DRAINING (1U << 30) /* draining state, entered on CONNECTION_CLOSE reception */ +#define QUIC_FL_CONN_IMMEDIATE_CLOSE (1U << 31) /* A CONNECTION_CLOSE must be sent */ + +#define QUIC_CONN_COMMON \ + struct { \ + /* Connection owned socket FD. */ \ + int fd; \ + unsigned int flags; \ + struct quic_err err; \ + /* When in closing state, number of packet before sending CC */ \ + unsigned int nb_pkt_for_cc; \ + /* When in closing state, number of packet since receiving CC */ \ + unsigned int nb_pkt_since_cc; \ + struct wait_event wait_event; \ + struct wait_event *subs; \ + struct sockaddr_storage local_addr; \ + struct sockaddr_storage peer_addr; \ + struct { \ + /* Number of bytes for prepared packets */ \ + uint64_t prep; \ + /* Number of sent bytes. */ \ + uint64_t tx; \ + /* Number of received bytes. */ \ + uint64_t rx; \ + } bytes; \ + /* First DCID used by client on its Initial packet. */ \ + struct quic_cid odcid; \ + /* DCID of our endpoint - not updated when a new DCID is used */ \ + struct quic_cid dcid; \ + /* first SCID of our endpoint - not updated when a new SCID is used */ \ + struct quic_cid scid; \ + /* tree of quic_connection_id - used to match a received packet DCID \ + * with a connection \ + */ \ + struct eb_root *cids; \ + struct listener *li; /* only valid for frontend connections */ \ + /* Idle timer task */ \ + struct task *idle_timer_task; \ + unsigned int idle_expire; \ + /* QUIC connection level counters */ \ + struct quic_conn_cntrs cntrs; \ + struct connection *conn; \ + } + +struct quic_conn { + QUIC_CONN_COMMON; + /* Used only to reach the tasklet for the I/O handler from this + * quic_conn object. + */ + struct ssl_sock_ctx *xprt_ctx; + const struct quic_version *original_version; + const struct quic_version *negotiated_version; + /* Negotiated version Initial TLS context */ + struct quic_tls_ctx *nictx; + /* QUIC transport parameters TLS extension */ + int tps_tls_ext; + int state; + enum qc_mux_state mux_state; /* status of the connection/mux layer */ +#ifdef USE_QUIC_OPENSSL_COMPAT + unsigned char enc_params[QUIC_TP_MAX_ENCLEN]; /* encoded QUIC transport parameters */ + size_t enc_params_len; +#endif + + uint64_t next_cid_seq_num; + /* Initial hash computed from first ID (derived from ODCID). 
+ * it could be reused to derive extra CIDs from the same hash + */ + uint64_t hash64; + + /* Initial encryption level */ + struct quic_enc_level *iel; + /* 0-RTT encryption level */ + struct quic_enc_level *eel; + /* Handshake encryption level */ + struct quic_enc_level *hel; + /* 1-RTT encryption level */ + struct quic_enc_level *ael; + /* List of allocated QUIC TLS encryption level */ + struct list qel_list; + + struct quic_pktns *ipktns; + struct quic_pktns *hpktns; + struct quic_pktns *apktns; + /* List of packet number spaces attached to this connection */ + struct list pktns_list; + +#ifdef USE_QUIC_OPENSSL_COMPAT + struct quic_openssl_compat openssl_compat; +#endif + + struct { + /* Transport parameters sent by the peer */ + struct quic_transport_params params; + /* Send buffer used to write datagrams. */ + struct buffer buf; + /* Send buffer used to send a "connection close" datagram . */ + struct buffer cc_buf; + char *cc_buf_area; + /* Length of the "connection close" datagram. */ + size_t cc_dgram_len; + } tx; + struct { + /* Transport parameters the peer will receive */ + struct quic_transport_params params; + /* RX buffer */ + struct buffer buf; + struct list pkt_list; + struct { + /* Number of open or closed streams */ + uint64_t nb_streams; + } strms[QCS_MAX_TYPES]; + } rx; + struct { + struct quic_tls_kp prv_rx; + struct quic_tls_kp nxt_rx; + struct quic_tls_kp nxt_tx; + } ku; + unsigned int max_ack_delay; + unsigned int max_idle_timeout; + struct quic_cc_path paths[1]; + struct quic_cc_path *path; + + struct mt_list accept_list; /* chaining element used for accept, only valid for frontend connections */ + + struct eb_root streams_by_id; /* qc_stream_desc tree */ + int stream_buf_count; /* total count of allocated stream buffers for this connection */ + + /* MUX */ + struct qcc *qcc; + struct task *timer_task; + unsigned int timer; + unsigned int ack_expire; + /* Handshake expiration date */ + unsigned int hs_expire; + + const struct qcc_app_ops *app_ops; + /* Proxy counters */ + struct quic_counters *prx_counters; + + struct list el_th_ctx; /* list elem in ha_thread_ctx */ + struct list back_refs; /* list head of CLI context currently dumping this connection. */ + unsigned int qc_epoch; /* delimiter for newer instances started after "show quic". */ +}; + +/* QUIC connection in "connection close" state. */ +struct quic_conn_closed { + QUIC_CONN_COMMON; + char *cc_buf_area; + /* Length of the "connection close" datagram. */ + size_t cc_dgram_len; +}; + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_CONN_T_H */ diff --git a/include/haproxy/quic_conn.h b/include/haproxy/quic_conn.h new file mode 100644 index 0000000..92caed4 --- /dev/null +++ b/include/haproxy/quic_conn.h @@ -0,0 +1,201 @@ +/* + * include/haproxy/quic_conn.h + * + * Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_QUIC_CONN_H +#define _HAPROXY_QUIC_CONN_H +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <inttypes.h> + +#include <import/eb64tree.h> +#include <import/ebmbtree.h> + +#include <haproxy/chunk.h> +#include <haproxy/dynbuf.h> +#include <haproxy/ncbuf.h> +#include <haproxy/net_helper.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/ticks.h> + +#include <haproxy/listener.h> +#include <haproxy/proto_quic.h> +#include <haproxy/quic_cc.h> +#include <haproxy/quic_cid.h> +#include <haproxy/quic_conn-t.h> +#include <haproxy/quic_enc.h> +#include <haproxy/quic_frame.h> +#include <haproxy/quic_loss.h> +#include <haproxy/quic_rx.h> +#include <haproxy/mux_quic.h> + +#include <openssl/rand.h> + +extern struct pool_head *pool_head_quic_connection_id; + +int qc_conn_finalize(struct quic_conn *qc, int server); +int ssl_quic_initial_ctx(struct bind_conf *bind_conf); +struct quic_cstream *quic_cstream_new(struct quic_conn *qc); +void quic_cstream_free(struct quic_cstream *cs); +void quic_free_arngs(struct quic_conn *qc, struct quic_arngs *arngs); +struct quic_cstream *quic_cstream_new(struct quic_conn *qc); +struct task *quic_conn_app_io_cb(struct task *t, void *context, unsigned int state); + +struct quic_connection_id *new_quic_cid(struct eb_root *root, + struct quic_conn *qc, + const struct quic_cid *orig, + const struct sockaddr_storage *addr); +void quic_conn_closed_err_count_inc(struct quic_conn *qc, struct quic_frame *frm); +int qc_h3_request_reject(struct quic_conn *qc, uint64_t id); +struct quic_conn *qc_new_conn(const struct quic_version *qv, int ipv4, + struct quic_cid *dcid, struct quic_cid *scid, + const struct quic_cid *token_odcid, + struct quic_connection_id *conn_id, + struct sockaddr_storage *local_addr, + struct sockaddr_storage *peer_addr, + int server, int token, void *owner); +int quic_build_post_handshake_frames(struct quic_conn *qc); +const struct quic_version *qc_supported_version(uint32_t version); +int quic_peer_validated_addr(struct quic_conn *qc); +void qc_set_timer(struct quic_conn *qc); +void qc_detach_th_ctx_list(struct quic_conn *qc, int closing); +void qc_idle_timer_do_rearm(struct quic_conn *qc, int arm_ack); +void qc_idle_timer_rearm(struct quic_conn *qc, int read, int arm_ack); +void qc_check_close_on_released_mux(struct quic_conn *qc); +int quic_stateless_reset_token_cpy(unsigned char *pos, size_t len, + const unsigned char *salt, size_t saltlen); + +static inline int qc_is_listener(struct quic_conn *qc) +{ + return qc->flags & QUIC_FL_CONN_LISTENER; +} + +/* Free the CIDs attached to <conn> QUIC connection. 
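+ * Each CID is removed from the global receiver tree, then from the
+ * connection's own <cids> tree. Note the iteration pattern used below:
+ * eb64_next() is called *before* eb64_delete()/pool_free() so that the
+ * traversal stays valid while the current node is being released.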
*/ +static inline void free_quic_conn_cids(struct quic_conn *conn) +{ + struct eb64_node *node; + + if (!conn->cids) + return; + + node = eb64_first(conn->cids); + while (node) { + struct quic_connection_id *conn_id; + + conn_id = eb64_entry(node, struct quic_connection_id, seq_num); + + /* remove the CID from the receiver tree */ + quic_cid_delete(conn_id); + + /* remove the CID from the quic_conn tree */ + node = eb64_next(node); + eb64_delete(&conn_id->seq_num); + pool_free(pool_head_quic_connection_id, conn_id); + } +} + +/* Move all the connection IDs from <conn> QUIC connection to <cc_conn> */ +static inline void quic_conn_mv_cids_to_cc_conn(struct quic_conn_closed *cc_conn, + struct quic_conn *conn) +{ + struct eb64_node *node; + + node = eb64_first(conn->cids); + while (node) { + struct quic_connection_id *conn_id; + + conn_id = eb64_entry(node, struct quic_connection_id, seq_num); + conn_id->qc = (struct quic_conn *)cc_conn; + node = eb64_next(node); + } + +} + +/* Allocate the underlying required memory for <ncbuf> non-contiguous buffer */ +static inline struct ncbuf *quic_get_ncbuf(struct ncbuf *ncbuf) +{ + struct buffer buf = BUF_NULL; + + if (!ncb_is_null(ncbuf)) + return ncbuf; + + b_alloc(&buf); + BUG_ON(b_is_null(&buf)); + + *ncbuf = ncb_make(buf.area, buf.size, 0); + ncb_init(ncbuf, 0); + + return ncbuf; +} + +/* Release the underlying memory use by <ncbuf> non-contiguous buffer */ +static inline void quic_free_ncbuf(struct ncbuf *ncbuf) +{ + struct buffer buf; + + if (ncb_is_null(ncbuf)) + return; + + buf = b_make(ncbuf->area, ncbuf->size, 0, 0); + b_free(&buf); + offer_buffers(NULL, 1); + + *ncbuf = NCBUF_NULL; +} + +void chunk_frm_appendf(struct buffer *buf, const struct quic_frame *frm); +void quic_set_connection_close(struct quic_conn *qc, const struct quic_err err); +void quic_set_tls_alert(struct quic_conn *qc, int alert); +int quic_set_app_ops(struct quic_conn *qc, const unsigned char *alpn, size_t alpn_len); +int qc_check_dcid(struct quic_conn *qc, unsigned char *dcid, size_t dcid_len); +struct quic_cid quic_derive_cid(const struct quic_cid *orig, + const struct sockaddr_storage *addr); +int quic_get_cid_tid(const unsigned char *cid, size_t cid_len, + const struct sockaddr_storage *cli_addr, + unsigned char *buf, size_t buf_len); +int qc_send_mux(struct quic_conn *qc, struct list *frms); + +void qc_notify_err(struct quic_conn *qc); +int qc_notify_send(struct quic_conn *qc); + +void qc_check_close_on_released_mux(struct quic_conn *qc); + +void quic_conn_release(struct quic_conn *qc); + +void qc_kill_conn(struct quic_conn *qc); + +int qc_parse_hd_form(struct quic_rx_packet *pkt, + unsigned char **buf, const unsigned char *end); + +int qc_set_tid_affinity(struct quic_conn *qc, uint new_tid, struct listener *new_li); +void qc_finalize_affinity_rebind(struct quic_conn *qc); +int qc_handle_conn_migration(struct quic_conn *qc, + const struct sockaddr_storage *peer_addr, + const struct sockaddr_storage *local_addr); + +/* Function pointer that can be used to compute a hash from first generated CID (derived from ODCID) */ +extern uint64_t (*quic_hash64_from_cid)(const unsigned char *cid, int size, const unsigned char *secret, size_t secretlen); +/* Function pointer that can be used to derive a new CID from the previously computed hash */ +extern void (*quic_newcid_from_hash64)(unsigned char *cid, int size, uint64_t hash, const unsigned char *secret, size_t secretlen); + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_CONN_H */ diff --git a/include/haproxy/quic_enc.h 
b/include/haproxy/quic_enc.h
new file mode 100644
index 0000000..4b85605
--- /dev/null
+++ b/include/haproxy/quic_enc.h
@@ -0,0 +1,275 @@
+/*
+ * include/haproxy/quic_enc.h
+ * This file contains QUIC varint encoding function prototypes
+ *
+ * Copyright 2021 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_QUIC_ENC_H
+#define _HAPROXY_QUIC_ENC_H
+#ifdef USE_QUIC
+#ifndef USE_OPENSSL
+#error "Must define USE_OPENSSL"
+#endif
+
+#include <inttypes.h>
+
+#include <haproxy/buf.h>
+#include <haproxy/chunk.h>
+
+/* The maximum value of a variable-length QUIC integer encoded with 1 byte */
+#define QUIC_VARINT_1_BYTE_MAX  ((1UL <<  6) - 1)
+/* The maximum value of a variable-length QUIC integer encoded with 2 bytes */
+#define QUIC_VARINT_2_BYTE_MAX  ((1UL << 14) - 1)
+/* The maximum value of a variable-length QUIC integer encoded with 4 bytes */
+#define QUIC_VARINT_4_BYTE_MAX  ((1UL << 30) - 1)
+/* The maximum value of a variable-length QUIC integer encoded with 8 bytes */
+#define QUIC_VARINT_8_BYTE_MAX  ((1ULL << 62) - 1)
+
+/* The maximum size of a variable-length QUIC integer */
+#define QUIC_VARINT_MAX_SIZE    8
+
+/* The two most significant bits of the first byte of a QUIC variable-length
+ * integer give the base-2 logarithm of its encoded length. For example 37 may
+ * be encoded as the single byte 0x25, or on two bytes as 0x40 0x25 (see
+ * RFC 9000, appendix A.1).
+ */
+#define QUIC_VARINT_BYTE_0_BITMASK 0x3f
+#define QUIC_VARINT_BYTE_0_SHIFT   6
+
+/* Return the base-2 logarithm of <val>, for the first powers of two which are
+ * used as sizes of QUIC variable-length integers.
+ * Returns -1 if <val> is out of the range of lengths supported by QUIC.
+ */
+static inline int quic_log2(unsigned int val)
+{
+	switch (val) {
+	case 8:
+		return 3;
+	case 4:
+		return 2;
+	case 2:
+		return 1;
+	case 1:
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+/* Returns the size in bytes required to encode a 64-bit integer if
+ * not out of range (< (1 << 62)), or 0 if out of range.
+ */
+static inline size_t quic_int_getsize(uint64_t val)
+{
+	switch (val) {
+	case 0 ... QUIC_VARINT_1_BYTE_MAX:
+		return 1;
+	case QUIC_VARINT_1_BYTE_MAX + 1 ... QUIC_VARINT_2_BYTE_MAX:
+		return 2;
+	case QUIC_VARINT_2_BYTE_MAX + 1 ... QUIC_VARINT_4_BYTE_MAX:
+		return 4;
+	case QUIC_VARINT_4_BYTE_MAX + 1 ... QUIC_VARINT_8_BYTE_MAX:
+		return 8;
+	default:
+		return 0;
+	}
+}
+
+/* Returns the maximum value of a QUIC variable-length integer with <sz> as size */
+static inline uint64_t quic_max_int(size_t sz)
+{
+	switch (sz) {
+	case 1:
+		return QUIC_VARINT_1_BYTE_MAX;
+	case 2:
+		return QUIC_VARINT_2_BYTE_MAX;
+	case 4:
+		return QUIC_VARINT_4_BYTE_MAX;
+	case 8:
+		return QUIC_VARINT_8_BYTE_MAX;
+	}
+
+	return -1;
+}
+
+/* Decode a QUIC variable-length integer from <buf> buffer into <val>.
+ * Note that the result is a 64-bit integer whose least significant 62 bits
+ * carry the value; the 2 most significant bits of the first byte encode
+ * the length of the integer.
+ * Returns 1 if it succeeded (there was enough data in <buf>), 0 if not.
+ */
+static inline int quic_dec_int(uint64_t *val,
+                               const unsigned char **buf,
+                               const unsigned char *end)
+{
+	size_t len;
+
+	if (*buf >= end)
+		return 0;
+
+	len = 1 << (**buf >> QUIC_VARINT_BYTE_0_SHIFT);
+	if (*buf + len > end)
+		return 0;
+
+	*val = *(*buf)++ & QUIC_VARINT_BYTE_0_BITMASK;
+	while (--len)
+		*val = (*val << 8) | *(*buf)++;
+
+	return 1;
+}
+
+/* Decode a QUIC variable-length integer from <b> buffer into <val>, with
+ * support for a wrapping buffer.
+ * Note that the result is a 64-bit integer whose least significant 62 bits
+ * carry the value; the 2 most significant bits of the first byte encode
+ * the length of the integer.
+ * Note that this function updates <b> buffer when a variable-length integer
+ * has successfully been parsed.
+ * Returns 1 if it succeeded (there was enough data in <b>), 0 if not.
+ * If <retlen> is not NULL, <*retlen> is incremented by the number of bytes
+ * consumed to decode the varint.
+ */
+static inline size_t b_quic_dec_int(uint64_t *val, struct buffer *b, size_t *retlen)
+{
+	const unsigned char *pos = (const unsigned char *)b_head(b);
+	const unsigned char *end = (const unsigned char *)b_wrap(b);
+	size_t size = b_size(b);
+	size_t data = b_data(b);
+	size_t save_len, len;
+
+	if (!data)
+		return 0;
+
+	save_len = len = 1 << (*pos >> QUIC_VARINT_BYTE_0_SHIFT);
+	if (data < len)
+		return 0;
+
+	*val = *pos & QUIC_VARINT_BYTE_0_BITMASK;
+	if (++pos == end)
+		pos -= size;
+	while (--len) {
+		*val = (*val << 8) | *pos;
+		if (++pos == end)
+			pos -= size;
+	}
+	if (retlen)
+		*retlen += save_len;
+	b_del(b, save_len);
+
+	return 1;
+}
+
+/* Encode <val> as a QUIC variable-length integer into <buf> buffer, with
+ * <end> the first byte address past the end of this buffer.
+ * Returns 1 if it succeeded (there was enough room in <buf>), 0 if not.
+ */
+static inline int quic_enc_int(unsigned char **buf, const unsigned char *end, uint64_t val)
+{
+	size_t len;
+	unsigned int shift;
+	unsigned char size_bits, *head;
+
+	len = quic_int_getsize(val);
+	if (!len || end - *buf < len)
+		return 0;
+
+	shift = (len - 1) * 8;
+	/* set the bits of byte#0 which give the length of the encoded integer */
+	size_bits = quic_log2(len) << QUIC_VARINT_BYTE_0_SHIFT;
+	head = *buf;
+	while (len--) {
+		*(*buf)++ = val >> shift;
+		shift -= 8;
+	}
+	*head |= size_bits;
+
+	return 1;
+}
+
+/* Encode a QUIC variable-length integer <val> into <b> buffer. <width> can be
+ * set to specify the desired output width. By default, use 0 for the minimal
+ * integer size. Other valid values are 1, 2, 4 or 8.
+ *
+ * Returns 1 on success else 0.
+ */
+static inline int b_quic_enc_int(struct buffer *b, uint64_t val, int width)
+{
+	char *pos;
+	int save_width, len;
+
+	/* width can only be 0, 1, 2, 4 or 8 */
+	BUG_ON(width && (width > 8 || atleast2(width)));
+
+	len = quic_int_getsize(val);
+	if (!len)
+		return 0;
+
+	/* Check that buffer room is sufficient and width big enough if set. */
+	if (b_room(b) < len || (width && width < len))
+		return 0;
+
+	if (!width)
+		width = len;
+	save_width = width;
+
+	pos = b_tail(b);
+	while (width--) {
+		/* Encode the shifted integer or 0 if width bigger than integer length. */
+		*pos++ = width >= len ?
0 : val >> (width * 8); + + if (pos == b_wrap(b)) + pos = b_orig(b); + } + + /* set the bits of byte#0 which gives the length of the encoded integer */ + *b_tail(b) |= quic_log2(save_width) << QUIC_VARINT_BYTE_0_SHIFT; + b_add(b, save_width); + + return 1; +} + +static inline size_t quic_incint_size_diff(uint64_t val) +{ + switch (val) { + case QUIC_VARINT_1_BYTE_MAX: + return 1; + case QUIC_VARINT_2_BYTE_MAX: + return 2; + case QUIC_VARINT_4_BYTE_MAX: + return 4; + default: + return 0; + } +} + +/* Return the difference between the encoded length of <val> and the encoded + * length of <val-1>. + */ +static inline size_t quic_decint_size_diff(uint64_t val) +{ + switch (val) { + case QUIC_VARINT_1_BYTE_MAX + 1: + return 1; + case QUIC_VARINT_2_BYTE_MAX + 1: + return 2; + case QUIC_VARINT_4_BYTE_MAX + 1: + return 4; + default: + return 0; + } +} + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_ENC_H */ diff --git a/include/haproxy/quic_frame-t.h b/include/haproxy/quic_frame-t.h new file mode 100644 index 0000000..5e91f93 --- /dev/null +++ b/include/haproxy/quic_frame-t.h @@ -0,0 +1,309 @@ +/* + * include/types/quic_frame.h + * This file contains QUIC frame definitions. + * + * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _TYPES_QUIC_FRAME_H +#define _TYPES_QUIC_FRAME_H +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <inttypes.h> +#include <stdlib.h> + +#include <import/ebtree-t.h> +#include <haproxy/buf-t.h> +#include <haproxy/list.h> +#include <haproxy/quic_stream-t.h> + +extern struct pool_head *pool_head_quic_frame; +extern struct pool_head *pool_head_qf_crypto; + +/* forward declarations from xprt-quic */ +struct quic_arngs; +struct quic_enc_level; +struct quic_tx_packet; + +/* QUIC frame types. 
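+ * The values below match RFC 9000, section 19. The eight STREAM variants
+ * (0x08..0x0f) directly encode the OFF (0x04), LEN (0x02) and FIN (0x01)
+ * bits in the frame type, see the QUIC_STREAM_FRAME_TYPE_*_BIT defines
+ * further down.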
*/ +enum quic_frame_type { + QUIC_FT_PADDING = 0x00, + QUIC_FT_PING = 0x01, + QUIC_FT_ACK = 0x02, + QUIC_FT_ACK_ECN = 0x03, + QUIC_FT_RESET_STREAM = 0x04, + QUIC_FT_STOP_SENDING = 0x05, + QUIC_FT_CRYPTO = 0x06, + QUIC_FT_NEW_TOKEN = 0x07, + + QUIC_FT_STREAM_8 = 0x08, + QUIC_FT_STREAM_9 = 0x09, + QUIC_FT_STREAM_A = 0x0a, + QUIC_FT_STREAM_B = 0x0b, + QUIC_FT_STREAM_C = 0x0c, + QUIC_FT_STREAM_D = 0x0d, + QUIC_FT_STREAM_E = 0x0e, + QUIC_FT_STREAM_F = 0x0f, + + QUIC_FT_MAX_DATA = 0x10, + QUIC_FT_MAX_STREAM_DATA = 0x11, + QUIC_FT_MAX_STREAMS_BIDI = 0x12, + QUIC_FT_MAX_STREAMS_UNI = 0x13, + QUIC_FT_DATA_BLOCKED = 0x14, + QUIC_FT_STREAM_DATA_BLOCKED = 0x15, + QUIC_FT_STREAMS_BLOCKED_BIDI = 0x16, + QUIC_FT_STREAMS_BLOCKED_UNI = 0x17, + QUIC_FT_NEW_CONNECTION_ID = 0x18, + QUIC_FT_RETIRE_CONNECTION_ID = 0x19, + QUIC_FT_PATH_CHALLENGE = 0x1a, + QUIC_FT_PATH_RESPONSE = 0x1b, + QUIC_FT_CONNECTION_CLOSE = 0x1c, + QUIC_FT_CONNECTION_CLOSE_APP = 0x1d, + QUIC_FT_HANDSHAKE_DONE = 0x1e, + /* Do not insert enums after the following one. */ + QUIC_FT_MAX +}; + +#define QUIC_FT_PKT_TYPE_I_BITMASK (1 << QUIC_PACKET_TYPE_INITIAL) +#define QUIC_FT_PKT_TYPE_0_BITMASK (1 << QUIC_PACKET_TYPE_0RTT) +#define QUIC_FT_PKT_TYPE_H_BITMASK (1 << QUIC_PACKET_TYPE_HANDSHAKE) +#define QUIC_FT_PKT_TYPE_1_BITMASK (1 << QUIC_PACKET_TYPE_SHORT) + +#define QUIC_FT_PKT_TYPE_IH01_BITMASK \ + (QUIC_FT_PKT_TYPE_I_BITMASK | QUIC_FT_PKT_TYPE_H_BITMASK | \ + QUIC_FT_PKT_TYPE_0_BITMASK | QUIC_FT_PKT_TYPE_1_BITMASK) + +#define QUIC_FT_PKT_TYPE_IH_1_BITMASK \ + (QUIC_FT_PKT_TYPE_I_BITMASK | QUIC_FT_PKT_TYPE_H_BITMASK | \ + QUIC_FT_PKT_TYPE_1_BITMASK) + +#define QUIC_FT_PKT_TYPE___01_BITMASK \ + (QUIC_FT_PKT_TYPE_0_BITMASK | QUIC_FT_PKT_TYPE_1_BITMASK) + +#define QUIC_FT_PKT_TYPE____1_BITMASK QUIC_FT_PKT_TYPE_1_BITMASK + + +/* Flag a TX frame as acknowledged */ +#define QUIC_FL_TX_FRAME_ACKED 0x01 + +#define QUIC_STREAM_FRAME_TYPE_FIN_BIT 0x01 +#define QUIC_STREAM_FRAME_TYPE_LEN_BIT 0x02 +#define QUIC_STREAM_FRAME_TYPE_OFF_BIT 0x04 + +/* Servers have the stream initiator bit set. */ +#define QUIC_STREAM_FRAME_ID_INITIATOR_BIT 0x01 +/* Unidirectional streams have the direction bit set. */ +#define QUIC_STREAM_FRAME_ID_DIR_BIT 0x02 + +#define QUIC_PATH_CHALLENGE_LEN 8 +/* Maximum phrase length in CONNECTION_CLOSE frame */ +#define QUIC_CC_REASON_PHRASE_MAXLEN 64 + +struct qf_padding { + size_t len; +}; + +struct qf_ack { + uint64_t largest_ack; + uint64_t ack_delay; + uint64_t ack_range_num; + uint64_t first_ack_range; +}; + +/* Structure used when emitting ACK frames. */ +struct qf_tx_ack { + uint64_t ack_delay; + struct quic_arngs *arngs; +}; + +struct qf_reset_stream { + uint64_t id; + uint64_t app_error_code; + uint64_t final_size; +}; + +struct qf_stop_sending { + uint64_t id; + uint64_t app_error_code; +}; + +struct qf_crypto { + struct list list; + uint64_t offset; + uint64_t len; + const struct quic_enc_level *qel; + const unsigned char *data; +}; + +struct qf_new_token { + uint64_t len; + const unsigned char *data; +}; + +struct qf_stream { + uint64_t id; + struct qc_stream_desc *stream; + + /* used only on TX when constructing frames. + * Data cleared when processing ACK related to this STREAM frame. + * + * A same buffer may be shared between several STREAM frames. The + * <data> field of each quic_stream serves to differentiate the payload + * of each of these. + */ + struct buffer *buf; + + struct eb64_node offset; + uint64_t len; + + /* for TX pointer into <buf> field. + * for RX pointer into the packet buffer. 
+ */ + const unsigned char *data; + + char dup; /* set for duplicated frame : this forces to check for the underlying qc_stream_buf instance before emitting it. */ +}; + +struct qf_max_data { + uint64_t max_data; +}; + +struct qf_max_stream_data { + uint64_t id; + uint64_t max_stream_data; +}; + +struct qf_max_streams { + uint64_t max_streams; +}; + +struct qf_data_blocked { + uint64_t limit; +}; + +struct qf_stream_data_blocked { + uint64_t id; + uint64_t limit; +}; + +struct qf_streams_blocked { + uint64_t limit; +}; + +struct qf_new_connection_id { + uint64_t seq_num; + uint64_t retire_prior_to; + struct { + unsigned char len; + const unsigned char *data; + } cid; + const unsigned char *stateless_reset_token; +}; + +struct qf_retire_connection_id { + uint64_t seq_num; +}; + +struct qf_path_challenge { + unsigned char data[QUIC_PATH_CHALLENGE_LEN]; +}; + +struct qf_path_challenge_response { + unsigned char data[QUIC_PATH_CHALLENGE_LEN]; +}; + +struct qf_connection_close { + uint64_t error_code; + uint64_t frame_type; + uint64_t reason_phrase_len; + unsigned char reason_phrase[QUIC_CC_REASON_PHRASE_MAXLEN]; +}; + +struct qf_connection_close_app { + uint64_t error_code; + uint64_t reason_phrase_len; + unsigned char reason_phrase[QUIC_CC_REASON_PHRASE_MAXLEN]; +}; + +struct quic_frame { + struct list list; /* List elem from parent elem (typically a Tx packet instance, a PKTNS or a MUX element). */ + struct quic_tx_packet *pkt; /* Last Tx packet used to send the frame. */ + unsigned char type; /* QUIC frame type. */ + union { + struct qf_padding padding; + struct qf_ack ack; + struct qf_tx_ack tx_ack; + struct qf_crypto crypto; + struct qf_reset_stream reset_stream; + struct qf_stop_sending stop_sending; + struct qf_new_token new_token; + struct qf_stream stream; + struct qf_max_data max_data; + struct qf_max_stream_data max_stream_data; + struct qf_max_streams max_streams_bidi; + struct qf_max_streams max_streams_uni; + struct qf_data_blocked data_blocked; + struct qf_stream_data_blocked stream_data_blocked; + struct qf_streams_blocked streams_blocked_bidi; + struct qf_streams_blocked streams_blocked_uni; + struct qf_new_connection_id new_connection_id; + struct qf_retire_connection_id retire_connection_id; + struct qf_path_challenge path_challenge; + struct qf_path_challenge_response path_challenge_response; + struct qf_connection_close connection_close; + struct qf_connection_close_app connection_close_app; + }; + struct quic_frame *origin; /* Parent frame. Set if frame is a duplicate (used for retransmission). */ + struct list reflist; /* List head containing duplicated children frames. */ + struct list ref; /* List elem from parent frame reflist. Set if frame is a duplicate (used for retransmission). */ + unsigned int flags; /* QUIC_FL_TX_FRAME_* */ + unsigned int loss_count; /* Counter for each occurrence of this frame marked as lost. */ +}; + + +/* QUIC error codes */ +struct quic_err { + uint64_t code; /* error code */ + int app; /* set for Application error code */ +}; + +/* Transport level error codes. 
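+ * Values from RFC 9000, section 20.1. The 0x0100-0x01ff range transports TLS
+ * alerts: alert <a> is reported as QC_ERR_CRYPTO_ERROR | <a>, see
+ * quic_err_tls() in quic_frame.h.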
*/
+#define QC_ERR_NO_ERROR                     0x00
+#define QC_ERR_INTERNAL_ERROR               0x01
+#define QC_ERR_CONNECTION_REFUSED           0x02
+#define QC_ERR_FLOW_CONTROL_ERROR           0x03
+#define QC_ERR_STREAM_LIMIT_ERROR           0x04
+#define QC_ERR_STREAM_STATE_ERROR           0x05
+#define QC_ERR_FINAL_SIZE_ERROR             0x06
+#define QC_ERR_FRAME_ENCODING_ERROR         0x07
+#define QC_ERR_TRANSPORT_PARAMETER_ERROR    0x08
+#define QC_ERR_CONNECTION_ID_LIMIT_ERROR    0x09
+#define QC_ERR_PROTOCOL_VIOLATION           0x0a
+#define QC_ERR_INVALID_TOKEN                0x0b
+#define QC_ERR_APPLICATION_ERROR            0x0c
+#define QC_ERR_CRYPTO_BUFFER_EXCEEDED       0x0d
+#define QC_ERR_KEY_UPDATE_ERROR             0x0e
+#define QC_ERR_AEAD_LIMIT_REACHED           0x0f
+#define QC_ERR_NO_VIABLE_PATH               0x10
+/* 256 TLS reserved errors 0x100-0x1ff. */
+#define QC_ERR_CRYPTO_ERROR                 0x100
+
+#endif /* USE_QUIC */
+#endif /* _TYPES_QUIC_FRAME_H */
diff --git a/include/haproxy/quic_frame.h b/include/haproxy/quic_frame.h
new file mode 100644
index 0000000..90d6b21
--- /dev/null
+++ b/include/haproxy/quic_frame.h
@@ -0,0 +1,281 @@
+/*
+ * include/haproxy/quic_frame.h
+ * This file contains prototypes for QUIC frames.
+ *
+ * Copyright 2020 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_QUIC_FRAME_H
+#define _HAPROXY_QUIC_FRAME_H
+#ifdef USE_QUIC
+#ifndef USE_OPENSSL
+#error "Must define USE_OPENSSL"
+#endif
+
+#include <import/eb64tree.h>
+#include <haproxy/quic_conn-t.h>
+#include <haproxy/quic_enc.h>
+#include <haproxy/quic_frame-t.h>
+#include <haproxy/quic_rx-t.h>
+
+const char *quic_frame_type_string(enum quic_frame_type ft);
+
+int qc_build_frm(unsigned char **pos, const unsigned char *end,
+                 struct quic_frame *frm, struct quic_tx_packet *pkt,
+                 struct quic_conn *conn);
+
+int qc_parse_frm(struct quic_frame *frm, struct quic_rx_packet *pkt,
+                 const unsigned char **pos, const unsigned char *end,
+                 struct quic_conn *conn);
+
+void qc_release_frm(struct quic_conn *qc, struct quic_frame *frm);
+
+/* Return the length of <frm> frame if successful, -1 if not (unknown frames,
+ * or frames which must not be transmitted again after having been lost, such
+ * as PING and PADDING).
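+ * For instance the length of a HANDSHAKE_DONE frame is 1 (its type byte
+ * alone), while a PING frame falls through to the default case and thus
+ * yields -1.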
+ */ +static inline size_t qc_frm_len(struct quic_frame *frm) +{ + size_t len = 0; + + switch (frm->type) { + case QUIC_FT_ACK: { + struct qf_tx_ack *tx_ack = &frm->tx_ack; + struct eb64_node *ar, *prev_ar; + struct quic_arng_node *ar_node, *prev_ar_node; + + ar = eb64_last(&tx_ack->arngs->root); + ar_node = eb64_entry(ar, struct quic_arng_node, first); + len += 1 + quic_int_getsize(ar_node->last); + len += quic_int_getsize(tx_ack->ack_delay); + len += quic_int_getsize(tx_ack->arngs->sz - 1); + len += quic_int_getsize(ar_node->last - ar_node->first.key); + + while ((prev_ar = eb64_prev(ar))) { + prev_ar_node = eb64_entry(prev_ar, struct quic_arng_node, first); + len += quic_int_getsize(ar_node->first.key - prev_ar_node->last - 2); + len += quic_int_getsize(prev_ar_node->last - prev_ar_node->first.key); + ar = prev_ar; + ar_node = eb64_entry(ar, struct quic_arng_node, first); + } + break; + } + case QUIC_FT_RESET_STREAM: { + struct qf_reset_stream *f = &frm->reset_stream; + len += 1 + quic_int_getsize(f->id) + + quic_int_getsize(f->app_error_code) + quic_int_getsize(f->final_size); + break; + } + case QUIC_FT_STOP_SENDING: { + struct qf_stop_sending *f = &frm->stop_sending; + len += 1 + quic_int_getsize(f->id) + quic_int_getsize(f->app_error_code); + break; + } + case QUIC_FT_CRYPTO: { + struct qf_crypto *f = &frm->crypto; + len += 1 + quic_int_getsize(f->offset) + quic_int_getsize(f->len) + f->len; + break; + } + case QUIC_FT_NEW_TOKEN: { + struct qf_new_token *f = &frm->new_token; + len += 1 + quic_int_getsize(f->len) + f->len; + break; + } + case QUIC_FT_STREAM_8 ... QUIC_FT_STREAM_F: { + struct qf_stream *f = &frm->stream; + len += 1 + quic_int_getsize(f->id) + + ((frm->type & QUIC_STREAM_FRAME_TYPE_OFF_BIT) ? quic_int_getsize(f->offset.key) : 0) + + ((frm->type & QUIC_STREAM_FRAME_TYPE_LEN_BIT) ? 
quic_int_getsize(f->len) : 0) + f->len; + break; + } + case QUIC_FT_MAX_DATA: { + struct qf_max_data *f = &frm->max_data; + len += 1 + quic_int_getsize(f->max_data); + break; + } + case QUIC_FT_MAX_STREAM_DATA: { + struct qf_max_stream_data *f = &frm->max_stream_data; + len += 1 + quic_int_getsize(f->id) + quic_int_getsize(f->max_stream_data); + break; + } + case QUIC_FT_MAX_STREAMS_BIDI: { + struct qf_max_streams *f = &frm->max_streams_bidi; + len += 1 + quic_int_getsize(f->max_streams); + break; + } + case QUIC_FT_MAX_STREAMS_UNI: { + struct qf_max_streams *f = &frm->max_streams_uni; + len += 1 + quic_int_getsize(f->max_streams); + break; + } + case QUIC_FT_DATA_BLOCKED: { + struct qf_data_blocked *f = &frm->data_blocked; + len += 1 + quic_int_getsize(f->limit); + break; + } + case QUIC_FT_STREAM_DATA_BLOCKED: { + struct qf_stream_data_blocked *f = &frm->stream_data_blocked; + len += 1 + quic_int_getsize(f->id) + quic_int_getsize(f->limit); + break; + } + case QUIC_FT_STREAMS_BLOCKED_BIDI: { + struct qf_streams_blocked *f = &frm->streams_blocked_bidi; + len += 1 + quic_int_getsize(f->limit); + break; + } + case QUIC_FT_STREAMS_BLOCKED_UNI: { + struct qf_streams_blocked *f = &frm->streams_blocked_uni; + len += 1 + quic_int_getsize(f->limit); + break; + } + case QUIC_FT_NEW_CONNECTION_ID: { + struct qf_new_connection_id *f = &frm->new_connection_id; + len += 1 + quic_int_getsize(f->seq_num) + quic_int_getsize(f->retire_prior_to) + + quic_int_getsize(f->cid.len) + f->cid.len + QUIC_STATELESS_RESET_TOKEN_LEN; + break; + } + case QUIC_FT_RETIRE_CONNECTION_ID: { + struct qf_retire_connection_id *f = &frm->retire_connection_id; + len += 1 + quic_int_getsize(f->seq_num); + break; + } + case QUIC_FT_PATH_CHALLENGE: { + struct qf_path_challenge *f = &frm->path_challenge; + len += 1 + sizeof f->data; + break; + } + case QUIC_FT_PATH_RESPONSE: { + struct qf_path_challenge_response *f = &frm->path_challenge_response; + len += 1 + sizeof f->data; + break; + } + case QUIC_FT_CONNECTION_CLOSE: { + struct qf_connection_close *f = &frm->connection_close; + len += 1 + quic_int_getsize(f->error_code) + quic_int_getsize(f->frame_type) + + quic_int_getsize(f->reason_phrase_len) + f->reason_phrase_len; + break; + } + case QUIC_FT_CONNECTION_CLOSE_APP: { + struct qf_connection_close *f = &frm->connection_close; + len += 1 + quic_int_getsize(f->error_code) + + quic_int_getsize(f->reason_phrase_len) + f->reason_phrase_len; + break; + } + case QUIC_FT_HANDSHAKE_DONE: { + len += 1; + break; + } + default: + return -1; + } + + return len; +} + +static inline struct quic_err quic_err_transport(uint64_t code) +{ + return (struct quic_err){ .code = code, .app = 0 }; +} + +static inline struct quic_err quic_err_tls(uint64_t tls_alert) +{ + const uint64_t code = QC_ERR_CRYPTO_ERROR|tls_alert; + return (struct quic_err){ .code = code, .app = 0 }; +} + +static inline struct quic_err quic_err_app(uint64_t code) +{ + return (struct quic_err){ .code = code, .app = 1 }; +} + +/* Allocate a quic_frame with type <type>. Frame must be freed with + * qc_frm_free(). + * + * Returns the allocated frame or NULL on failure. 
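+ *
+ * A minimal usage sketch (error handling elided; <qc>, the <frms> list and
+ * <new_limit> are illustrative names, not part of this API):
+ *
+ *     struct quic_frame *frm = qc_frm_alloc(QUIC_FT_MAX_DATA);
+ *
+ *     frm->max_data.max_data = new_limit;
+ *     LIST_APPEND(&frms, &frm->list);
+ *     ...
+ *     qc_frm_free(qc, &frm);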
+ */ +static inline struct quic_frame *qc_frm_alloc(int type) +{ + struct quic_frame *frm = NULL; + + frm = pool_alloc(pool_head_quic_frame); + if (!frm) + return NULL; + + frm->type = type; + + LIST_INIT(&frm->list); + LIST_INIT(&frm->reflist); + LIST_INIT(&frm->ref); + frm->pkt = NULL; + frm->origin = NULL; + frm->flags = 0; + frm->loss_count = 0; + + return frm; +} + +/* Allocate a quic_frame by duplicating <origin> frame. This will create a new + * frame of the same type with the same content. Internal fields such as packet + * owner and flags are however reset for the newly allocated frame except + * for the loss counter. Frame must be freed with qc_frm_free(). + * + * Returns the allocated frame or NULL on failure. + */ +static inline struct quic_frame *qc_frm_dup(struct quic_frame *origin) +{ + struct quic_frame *frm = NULL; + + frm = pool_alloc(pool_head_quic_frame); + if (!frm) + return NULL; + + *frm = *origin; + + /* Reinit all internal members except loss_count. */ + LIST_INIT(&frm->list); + LIST_INIT(&frm->reflist); + frm->pkt = NULL; + frm->flags = 0; + + /* Attach <frm> to <origin>. */ + LIST_APPEND(&origin->reflist, &frm->ref); + frm->origin = origin; + + return frm; +} + +void qc_frm_free(struct quic_conn *qc, struct quic_frame **frm); +void qc_frm_unref(struct quic_frame *frm, struct quic_conn *qc); + +/* Move forward <strm> STREAM frame by <data> bytes. */ +static inline void qc_stream_frm_mv_fwd(struct quic_frame *frm, uint64_t data) +{ + struct qf_stream *strm_frm = &frm->stream; + struct buffer cf_buf; + + /* Set offset bit if not already there. */ + strm_frm->offset.key += data; + frm->type |= QUIC_STREAM_FRAME_TYPE_OFF_BIT; + + strm_frm->len -= data; + cf_buf = b_make(b_orig(strm_frm->buf), + b_size(strm_frm->buf), + (char *)strm_frm->data - b_orig(strm_frm->buf), 0); + strm_frm->data = (unsigned char *)b_peek(&cf_buf, data); +} + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_FRAME_H */ diff --git a/include/haproxy/quic_loss-t.h b/include/haproxy/quic_loss-t.h new file mode 100644 index 0000000..0f07ddc --- /dev/null +++ b/include/haproxy/quic_loss-t.h @@ -0,0 +1,62 @@ +/* + * include/types/quic_loss.h + * This file contains definitions for QUIC loss detection. + * + * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _TYPES_QUIC_LOSS_H +#define _TYPES_QUIC_LOSS_H +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <inttypes.h> + +/* Maximum reordering in packets. 
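+ * (kPacketThreshold from RFC 9002, which recommends a value of 3.)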
*/ +#define QUIC_LOSS_PACKET_THRESHOLD 3 +#define QUIC_TIMER_GRANULARITY 1U /* 1ms */ +#define QUIC_LOSS_INITIAL_RTT 333U /* 333ms */ + +/* QUIC loss time threshold expressed an RTT multiplier + * (QUIC_LOSS_TIME_THRESHOLD_MULTIPLICAND / QUIC_LOSS_TIME_THRESHOLD_DIVISOR) + */ +#define QUIC_LOSS_TIME_THRESHOLD_MULTIPLICAND 9 +#define QUIC_LOSS_TIME_THRESHOLD_DIVISOR 8 + +/* Note that all the unit of variables for QUIC LOSS detections + * is the tick. + */ + +struct quic_loss { + /* The most recent RTT measurement (ms) */ + unsigned int latest_rtt; + /* Smoothed RTT (ms) */ + unsigned int srtt; + /* RTT variation (ms) */ + unsigned int rtt_var; + /* Minimum RTT (ms) */ + unsigned int rtt_min; + /* Number of NACKed sent PTO. */ + unsigned int pto_count; + unsigned long nb_lost_pkt; + unsigned long nb_reordered_pkt; +}; + +#endif /* USE_QUIC */ +#endif /* _TYPES_QUIC_LOSS_H */ diff --git a/include/haproxy/quic_loss.h b/include/haproxy/quic_loss.h new file mode 100644 index 0000000..fc713ca --- /dev/null +++ b/include/haproxy/quic_loss.h @@ -0,0 +1,92 @@ +/* + * include/proto/quic_loss.h + * This file provides interface definition for QUIC loss detection. + * + * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _PROTO_QUIC_LOSS_H +#define _PROTO_QUIC_LOSS_H +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <haproxy/quic_loss-t.h> + +#include <haproxy/api.h> +#include <haproxy/quic_conn-t.h> +#include <haproxy/quic_tls-t.h> + +static inline void quic_loss_init(struct quic_loss *ql) +{ + ql->latest_rtt = 0; + ql->srtt = QUIC_LOSS_INITIAL_RTT; + ql->rtt_var = QUIC_LOSS_INITIAL_RTT / 2; + ql->rtt_min = 0; + ql->pto_count = 0; + ql->nb_lost_pkt = 0; + ql->nb_reordered_pkt = 0; +} + +/* Return 1 if a persistent congestion is observed for a list of + * lost packets sent during <period> period depending on <ql> loss information, + * <now_us> the current time and <max_ack_delay_us> the maximum ACK delay of the connection + * experiencing a packet loss. Return 0 on the contrary. + */ +static inline int quic_loss_persistent_congestion(struct quic_loss *ql, + unsigned int period, + unsigned int now_us, + unsigned int max_ack_delay) +{ + unsigned int congestion_period; + + if (!period) + return 0; + + congestion_period = ql->srtt + + QUIC_MAX(4 * ql->rtt_var, QUIC_TIMER_GRANULARITY) + max_ack_delay; + congestion_period *= QUIC_LOSS_PACKET_THRESHOLD; + + return period >= congestion_period; +} + +/* Return the PTO associated to <pktns> packet number space for <qc> connection */ +static inline unsigned int quic_pto(struct quic_conn *qc) +{ + struct quic_loss *ql = &qc->path->loss; + + return ql->srtt + QUIC_MAX(4 * ql->rtt_var, QUIC_TIMER_GRANULARITY) + + (HA_ATOMIC_LOAD(&qc->state) >= QUIC_HS_ST_COMPLETE ? 
qc->max_ack_delay : 0); +} + +void quic_loss_srtt_update(struct quic_loss *ql, + unsigned int rtt, unsigned int ack_delay, + struct quic_conn *qc); + +struct quic_pktns *quic_loss_pktns(struct quic_conn *qc); + +struct quic_pktns *quic_pto_pktns(struct quic_conn *qc, + int handshake_completed, + unsigned int *pto); + +void qc_packet_loss_lookup(struct quic_pktns *pktns, struct quic_conn *qc, + struct list *lost_pkts); +int qc_release_lost_pkts(struct quic_conn *qc, struct quic_pktns *pktns, + struct list *pkts, uint64_t now_us); +#endif /* USE_QUIC */ +#endif /* _PROTO_QUIC_LOSS_H */ diff --git a/include/haproxy/quic_openssl_compat-t.h b/include/haproxy/quic_openssl_compat-t.h new file mode 100644 index 0000000..2f2b92b --- /dev/null +++ b/include/haproxy/quic_openssl_compat-t.h @@ -0,0 +1,64 @@ +#ifndef _HAPROXY_QUIC_OPENSSL_COMPAT_T_H_ +#define _HAPROXY_QUIC_OPENSSL_COMPAT_T_H_ + +#ifdef USE_QUIC_OPENSSL_COMPAT +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#define QUIC_OPENSSL_COMPAT_TLS_SECRET_LEN 48 +#define QUIC_OPENSSL_COMPAT_TLS_IV_LEN 12 + +/* Highly inspired from nginx QUIC TLS compatibility code */ + +enum ssl_encryption_level_t { + ssl_encryption_initial = 0, + ssl_encryption_early_data, + ssl_encryption_handshake, + ssl_encryption_application +}; + +typedef struct ssl_quic_method_st { + int (*set_encryption_secrets)(SSL *ssl, enum ssl_encryption_level_t level, + const uint8_t *rsecret, const uint8_t *wsecret, + size_t secret_len); + int (*add_handshake_data)(SSL *ssl, enum ssl_encryption_level_t level, + const uint8_t *data, size_t len); + int (*flush_flight)(SSL *ssl); + int (*send_alert)(SSL *ssl, enum ssl_encryption_level_t level, + uint8_t alert); +} SSL_QUIC_METHOD; + +struct quic_tls_md { + unsigned char data[QUIC_OPENSSL_COMPAT_TLS_SECRET_LEN]; + size_t len; +}; + +struct quic_tls_iv { + unsigned char data[QUIC_OPENSSL_COMPAT_TLS_IV_LEN]; + size_t len; +}; + +struct quic_tls_secret { + struct quic_tls_md secret; + struct quic_tls_md key; + struct quic_tls_iv iv; +}; + +struct quic_tls_compat_keys { + struct quic_tls_secret secret; + const EVP_CIPHER *cipher; +}; + +struct quic_openssl_compat { + BIO *rbio; + BIO *wbio; + const SSL_QUIC_METHOD *method; + enum ssl_encryption_level_t write_level; + enum ssl_encryption_level_t read_level; + uint64_t read_record; + struct quic_tls_compat_keys keys; +}; + +#endif /* USE_QUIC_OPENSSL_COMPAT */ +#endif /* _HAPROXY_QUIC_OPENSSL_COMPAT_T_H_ */ diff --git a/include/haproxy/quic_openssl_compat.h b/include/haproxy/quic_openssl_compat.h new file mode 100644 index 0000000..837a28d --- /dev/null +++ b/include/haproxy/quic_openssl_compat.h @@ -0,0 +1,33 @@ +#ifndef _HAPROXY_QUIC_OPENSSL_COMPAT_H_ +#define _HAPROXY_QUIC_OPENSSL_COMPAT_H_ + +#ifdef USE_QUIC_OPENSSL_COMPAT + +/* Highly inspired from nginx QUIC TLS compatibility code */ +#include <haproxy/listener-t.h> +#include <haproxy/quic_openssl_compat-t.h> + +#define QUIC_OPENSSL_COMPAT_SSL_TP_EXT 0x39 + +/* Used by keylog */ +#define QUIC_OPENSSL_COMPAT_CLIENT_HANDSHAKE "CLIENT_HANDSHAKE_TRAFFIC_SECRET" +#define QUIC_OPENSSL_COMPAT_SERVER_HANDSHAKE "SERVER_HANDSHAKE_TRAFFIC_SECRET" +#define QUIC_OPENSSL_COMPAT_CLIENT_APPLICATION "CLIENT_TRAFFIC_SECRET_0" +#define QUIC_OPENSSL_COMPAT_SERVER_APPLICATION "SERVER_TRAFFIC_SECRET_0" + +void quic_tls_compat_msg_callback(struct connection *conn, + int write_p, int version, int content_type, + const void *buf, size_t len, SSL *ssl); +int quic_tls_compat_init(struct bind_conf *bind_conf, SSL_CTX *ctx); +void 
quic_tls_compat_keylog_callback(const SSL *ssl, const char *line); + +int SSL_set_quic_method(SSL *ssl, const SSL_QUIC_METHOD *quic_method); +enum ssl_encryption_level_t SSL_quic_read_level(const SSL *ssl); +enum ssl_encryption_level_t SSL_quic_write_level(const SSL *ssl); +int SSL_set_quic_transport_params(SSL *ssl, const uint8_t *params, size_t params_len); +int SSL_provide_quic_data(SSL *ssl, enum ssl_encryption_level_t level, + const uint8_t *data, size_t len); +int SSL_process_quic_post_handshake(SSL *ssl); + +#endif /* USE_QUIC_OPENSSL_COMPAT */ +#endif /* _HAPROXY_QUIC_OPENSSL_COMPAT_H_ */ diff --git a/include/haproxy/quic_retransmit.h b/include/haproxy/quic_retransmit.h new file mode 100644 index 0000000..403a53c --- /dev/null +++ b/include/haproxy/quic_retransmit.h @@ -0,0 +1,20 @@ +#ifndef _HAPROXY_QUIC_RETRANSMIT_H +#define _HAPROXY_QUIC_RETRANSMIT_H + +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <haproxy/list-t.h> +#include <haproxy/quic_conn-t.h> +#include <haproxy/quic_tls-t.h> + +void qc_prep_fast_retrans(struct quic_conn *qc, + struct quic_pktns *pktns, + struct list *frms1, struct list *frms2); +void qc_prep_hdshk_fast_retrans(struct quic_conn *qc, + struct list *ifrms, struct list *hfrms); + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_RETRANSMIT_H */ diff --git a/include/haproxy/quic_retry.h b/include/haproxy/quic_retry.h new file mode 100644 index 0000000..d31be02 --- /dev/null +++ b/include/haproxy/quic_retry.h @@ -0,0 +1,33 @@ +#ifndef _HAPROXY_QUIC_RETRY_H +#define _HAPROXY_QUIC_RETRY_H + +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <inttypes.h> +#include <sys/socket.h> + +#include <haproxy/quic_cid-t.h> +#include <haproxy/quic_rx-t.h> +#include <haproxy/quic_sock-t.h> + +struct listener; + +int quic_generate_retry_token(unsigned char *token, size_t len, + const uint32_t version, + const struct quic_cid *odcid, + const struct quic_cid *dcid, + struct sockaddr_storage *addr); +int parse_retry_token(struct quic_conn *qc, + const unsigned char *token, const unsigned char *end, + struct quic_cid *odcid); +int quic_retry_token_check(struct quic_rx_packet *pkt, + struct quic_dgram *dgram, + struct listener *l, + struct quic_conn *qc, + struct quic_cid *odcid); + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_RETRY_H */ diff --git a/include/haproxy/quic_rx-t.h b/include/haproxy/quic_rx-t.h new file mode 100644 index 0000000..9ef8e7a --- /dev/null +++ b/include/haproxy/quic_rx-t.h @@ -0,0 +1,54 @@ +#ifndef _HAPROXY_RX_T_H +#define _HAPROXY_RX_T_H + +extern struct pool_head *pool_head_quic_conn_rxbuf; +extern struct pool_head *pool_head_quic_dgram; +extern struct pool_head *pool_head_quic_rx_packet; + +/* Maximum number of ack-eliciting received packets since the last + * ACK frame was sent + */ +#define QUIC_MAX_RX_AEPKTS_SINCE_LAST_ACK 2 +#define QUIC_ACK_DELAY (QUIC_TP_DFLT_MAX_ACK_DELAY - 5) +/* Flag a received packet as being an ack-eliciting packet. */ +#define QUIC_FL_RX_PACKET_ACK_ELICITING (1UL << 0) +/* Packet is the first one in the containing datagram. */ +#define QUIC_FL_RX_PACKET_DGRAM_FIRST (1UL << 1) +/* Spin bit set */ +#define QUIC_FL_RX_PACKET_SPIN_BIT (1UL << 2) + +struct quic_rx_packet { + struct list list; + struct list qc_rx_pkt_list; + + /* QUIC version used in packet. */ + const struct quic_version *version; + + unsigned char type; + /* Initial desctination connection ID. 
*/ + struct quic_cid dcid; + struct quic_cid scid; + /* Packet number offset : only valid for Initial/Handshake/0-RTT/1-RTT. */ + size_t pn_offset; + /* Packet number */ + int64_t pn; + /* Packet number length */ + uint32_t pnl; + uint64_t token_len; + unsigned char *token; + /* Packet length */ + uint64_t len; + /* Packet length before decryption */ + uint64_t raw_len; + /* Additional authenticated data length */ + size_t aad_len; + unsigned char *data; + struct eb64_node pn_node; + volatile unsigned int refcnt; + /* Source address of this packet. */ + struct sockaddr_storage saddr; + unsigned int flags; + unsigned int time_received; +}; + +#endif /* _HAPROXY_RX_T_H */ diff --git a/include/haproxy/quic_rx.h b/include/haproxy/quic_rx.h new file mode 100644 index 0000000..494bc4a --- /dev/null +++ b/include/haproxy/quic_rx.h @@ -0,0 +1,58 @@ +/* + * QUIC protocol definitions (RX side). + * + * Copyright (C) 2023 + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_QUIC_RX_H +#define _HAPROXY_QUIC_RX_H + +#include <haproxy/listener-t.h> +#include <haproxy/quic_conn-t.h> +#include <haproxy/quic_rx-t.h> + +int quic_dgram_parse(struct quic_dgram *dgram, struct quic_conn *from_qc, + struct listener *li); +int qc_treat_rx_pkts(struct quic_conn *qc); +int qc_parse_hd_form(struct quic_rx_packet *pkt, + unsigned char **pos, const unsigned char *end); +int qc_treat_rx_crypto_frms(struct quic_conn *qc, struct quic_enc_level *el, + struct ssl_sock_ctx *ctx); +int qc_handle_frms_of_lost_pkt(struct quic_conn *qc, + struct quic_tx_packet *pkt, + struct list *pktns_frm_list); + +/* Increment the reference counter of <pkt> */ +static inline void quic_rx_packet_refinc(struct quic_rx_packet *pkt) +{ + pkt->refcnt++; +} + +/* Decrement the reference counter of <pkt> while remaining positive */ +static inline void quic_rx_packet_refdec(struct quic_rx_packet *pkt) +{ + if (pkt->refcnt) + pkt->refcnt--; +} + +/* Return 1 if <pkt> header form is long, 0 if not. */ +static inline int qc_pkt_long(const struct quic_rx_packet *pkt) +{ + return pkt->type != QUIC_PACKET_TYPE_SHORT; +} + +#endif /* _HAPROXY_QUIC_RX_H */ diff --git a/include/haproxy/quic_sock-t.h b/include/haproxy/quic_sock-t.h new file mode 100644 index 0000000..67a5749 --- /dev/null +++ b/include/haproxy/quic_sock-t.h @@ -0,0 +1,50 @@ +#ifndef _HAPROXY_QUIC_SOCK_T_H +#define _HAPROXY_QUIC_SOCK_T_H +#ifdef USE_QUIC + +#include <haproxy/buf-t.h> + +/* QUIC socket allocation strategy. */ +enum quic_sock_mode { + QUIC_SOCK_MODE_CONN, /* Use a dedicated socket per connection. */ + QUIC_SOCK_MODE_LSTNR, /* Multiplex connections over listener socket. */ +}; + +/* QUIC connection accept queue. One per thread. 
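+ *
+ * Rough picture (summary note): datagrams may be received on any thread, so
+ * a listener with at least one connection ready to be accepted is queued on
+ * the accept queue of the connection's owning thread, and this queue's
+ * tasklet then wakes up listener_accept() on that thread.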
*/
+struct quic_accept_queue {
+ struct mt_list listeners; /* QUIC listeners with at least one connection ready to be accepted on this queue */
+ struct tasklet *tasklet; /* tasklet responsible for calling listener_accept */
+};
+
+/* Buffer used to receive QUIC datagrams on a random thread and redispatch
+ * them to the connection thread.
+ */
+struct quic_receiver_buf {
+ struct buffer buf; /* storage for received datagrams. */
+ struct list dgram_list; /* datagrams received with this rxbuf. */
+ struct mt_list rxbuf_el; /* list element into receiver.rxbuf_list. */
+};
+
+/* QUIC datagram */
+struct quic_dgram {
+ void *owner;
+ unsigned char *buf;
+ size_t len;
+ unsigned char *dcid;
+ size_t dcid_len;
+ struct sockaddr_storage saddr;
+ struct sockaddr_storage daddr;
+ struct quic_conn *qc;
+
+ struct list recv_list; /* element into quic_receiver_buf <dgram_list>. */
+ struct mt_list handler_list; /* element into quic_dghdlr <dgrams>. */
+};
+
+/* QUIC datagram handler */
+struct quic_dghdlr {
+ struct mt_list dgrams;
+ struct tasklet *task;
+};
+
+#endif /* USE_QUIC */
+#endif /* _HAPROXY_QUIC_SOCK_T_H */
diff --git a/include/haproxy/quic_sock.h b/include/haproxy/quic_sock.h
new file mode 100644
index 0000000..531cf62
--- /dev/null
+++ b/include/haproxy/quic_sock.h
@@ -0,0 +1,107 @@
+/*
+ * include/haproxy/quic_sock.h
+ * This file contains declarations for QUIC sockets.
+ *
+ * Copyright 2020 Frederic Lecaille <flecaille@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_QUIC_SOCK_H +#define _HAPROXY_QUIC_SOCK_H +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <sys/socket.h> +#include <sys/types.h> + +#include <haproxy/api.h> +#include <haproxy/connection-t.h> +#include <haproxy/listener-t.h> +#include <haproxy/quic_conn-t.h> +#include <haproxy/quic_sock-t.h> + +int quic_session_accept(struct connection *cli_conn); +int quic_sock_get_src(struct connection *conn, struct sockaddr *addr, socklen_t len); +int quic_sock_get_dst(struct connection *conn, struct sockaddr *addr, socklen_t len); +int quic_sock_accepting_conn(const struct receiver *rx); +struct connection *quic_sock_accept_conn(struct listener *l, int *status); + +struct task *quic_lstnr_dghdlr(struct task *t, void *ctx, unsigned int state); +void quic_lstnr_sock_fd_iocb(int fd); +int qc_snd_buf(struct quic_conn *qc, const struct buffer *buf, size_t count, + int flags); +int qc_rcv_buf(struct quic_conn *qc); +void quic_conn_sock_fd_iocb(int fd); + +void qc_alloc_fd(struct quic_conn *qc, const struct sockaddr_storage *src, + const struct sockaddr_storage *dst); +void qc_release_fd(struct quic_conn *qc, int reinit); +void qc_want_recv(struct quic_conn *qc); + +void quic_accept_push_qc(struct quic_conn *qc); + +int quic_listener_max_handshake(const struct listener *l); +int quic_listener_max_accept(const struct listener *l); + +/* Set default value for <qc> socket as uninitialized. */ +static inline void qc_init_fd(struct quic_conn *qc) +{ + qc->fd = -1; +} + +/* Returns true if <qc> socket is initialized else false. */ +static inline char qc_test_fd(struct quic_conn *qc) +{ + /* quic-conn socket should not be accessed once it has been released. */ + BUG_ON(qc->fd == DEAD_FD_MAGIC); + return qc->fd >= 0; +} + +/* Try to increment <l> handshake current counter. If listener limit is + * reached, incrementation is rejected and 0 is returned. + */ +static inline int quic_increment_curr_handshake(struct listener *l) +{ + unsigned int count, next; + const int max = quic_listener_max_handshake(l); + + do { + count = l->rx.quic_curr_handshake; + if (count >= max) { + /* maxconn reached */ + next = 0; + goto end; + } + + /* try to increment quic_curr_handshake */ + next = count + 1; + } while (!_HA_ATOMIC_CAS(&l->rx.quic_curr_handshake, &count, next) && __ha_cpu_relax()); + + end: + return next; +} + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_SOCK_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/quic_ssl-t.h b/include/haproxy/quic_ssl-t.h new file mode 100644 index 0000000..3c057c6 --- /dev/null +++ b/include/haproxy/quic_ssl-t.h @@ -0,0 +1,21 @@ +/* + * include/haproxy/quic_ssl-t.h + * Definitions for QUIC over TLS/SSL api types, constants and flags. + * + * Copyright (C) 2023 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#ifndef _HAPROXY_QUIC_SSL_T_H +#define _HAPROXY_QUIC_SSL_T_H + +#include <haproxy/pool-t.h> + +extern struct pool_head *pool_head_quic_ssl_sock_ctx; + +#endif /* _HAPROXY_QUIC_SSL_T_H */ diff --git a/include/haproxy/quic_ssl.h b/include/haproxy/quic_ssl.h new file mode 100644 index 0000000..8f7df47 --- /dev/null +++ b/include/haproxy/quic_ssl.h @@ -0,0 +1,55 @@ +/* + * include/haproxy/quic_ssl.h + * This file contains QUIC over TLS/SSL api definitions. + * + * Copyright (C) 2023 + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ +#ifndef _HAPROXY_QUIC_SSL_H +#define _HAPROXY_QUIC_SSL_H + +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <haproxy/listener-t.h> +#include <haproxy/ncbuf-t.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/pool.h> +#include <haproxy/quic_ssl-t.h> +#include <haproxy/ssl_sock-t.h> + +int ssl_quic_initial_ctx(struct bind_conf *bind_conf); +int qc_alloc_ssl_sock_ctx(struct quic_conn *qc); +int qc_ssl_provide_quic_data(struct ncbuf *ncbuf, + enum ssl_encryption_level_t level, + struct ssl_sock_ctx *ctx, + const unsigned char *data, size_t len); +int qc_ssl_provide_all_quic_data(struct quic_conn *qc, struct ssl_sock_ctx *ctx); + +static inline void qc_free_ssl_sock_ctx(struct ssl_sock_ctx **ctx) +{ + if (!*ctx) + return; + + SSL_free((*ctx)->ssl); + pool_free(pool_head_quic_ssl_sock_ctx, *ctx); + *ctx = NULL; +} + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_SSL_H */ diff --git a/include/haproxy/quic_stats-t.h b/include/haproxy/quic_stats-t.h new file mode 100644 index 0000000..1ee6265 --- /dev/null +++ b/include/haproxy/quic_stats-t.h @@ -0,0 +1,105 @@ +#ifndef _HAPROXY_QUIC_STATS_T_H +#define _HAPROXY_QUIC_STATS_T_H + +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +extern struct stats_module quic_stats_module; + +enum { + QUIC_ST_RXBUF_FULL, + QUIC_ST_DROPPED_PACKET, + QUIC_ST_DROPPED_PACKET_BUFOVERRUN, + QUIC_ST_DROPPED_PARSING, + QUIC_ST_SOCKET_FULL, + QUIC_ST_SENDTO_ERR, + QUIC_ST_SENDTO_ERR_UNKNWN, + QUIC_ST_SENT_PACKET, + QUIC_ST_LOST_PACKET, + QUIC_ST_TOO_SHORT_INITIAL_DGRAM, + QUIC_ST_RETRY_SENT, + QUIC_ST_RETRY_VALIDATED, + QUIC_ST_RETRY_ERRORS, + QUIC_ST_HALF_OPEN_CONN, + QUIC_ST_HDSHK_FAIL, + QUIC_ST_STATELESS_RESET_SENT, + /* Special events of interest */ + QUIC_ST_CONN_MIGRATION_DONE, + /* Transport errors */ + QUIC_ST_TRANSP_ERR_NO_ERROR, + QUIC_ST_TRANSP_ERR_INTERNAL_ERROR, + QUIC_ST_TRANSP_ERR_CONNECTION_REFUSED, + QUIC_ST_TRANSP_ERR_FLOW_CONTROL_ERROR, + QUIC_ST_TRANSP_ERR_STREAM_LIMIT_ERROR, + QUIC_ST_TRANSP_ERR_STREAM_STATE_ERROR, + QUIC_ST_TRANSP_ERR_FINAL_SIZE_ERROR, + QUIC_ST_TRANSP_ERR_FRAME_ENCODING_ERROR, + QUIC_ST_TRANSP_ERR_TRANSPORT_PARAMETER_ERROR, + QUIC_ST_TRANSP_ERR_CONNECTION_ID_LIMIT_ERROR, + QUIC_ST_TRANSP_ERR_PROTOCOL_VIOLATION, + QUIC_ST_TRANSP_ERR_INVALID_TOKEN, + 
QUIC_ST_TRANSP_ERR_APPLICATION_ERROR,
+ QUIC_ST_TRANSP_ERR_CRYPTO_BUFFER_EXCEEDED,
+ QUIC_ST_TRANSP_ERR_KEY_UPDATE_ERROR,
+ QUIC_ST_TRANSP_ERR_AEAD_LIMIT_REACHED,
+ QUIC_ST_TRANSP_ERR_NO_VIABLE_PATH,
+ QUIC_ST_TRANSP_ERR_CRYPTO_ERROR,
+ QUIC_ST_TRANSP_ERR_UNKNOWN_ERROR,
+ /* Stream related counters */
+ QUIC_ST_DATA_BLOCKED,
+ QUIC_ST_STREAM_DATA_BLOCKED,
+ QUIC_ST_STREAMS_BLOCKED_BIDI,
+ QUIC_ST_STREAMS_BLOCKED_UNI,
+ QUIC_STATS_COUNT /* must be the last */
+};
+
+struct quic_counters {
+ long long rxbuf_full; /* receive operation cancelled due to full buffer */
+ long long dropped_pkt; /* total number of dropped packets */
+ long long dropped_pkt_bufoverrun;/* total number of dropped packets because of buffer overrun */
+ long long dropped_parsing; /* total number of dropped packets upon parsing errors */
+ long long socket_full; /* total number of EAGAIN errors on sendto() calls */
+ long long sendto_err; /* total number of errors on sendto() calls, EAGAIN excepted */
+ long long sendto_err_unknown; /* total number of errors on sendto() calls which are currently not supported */
+ long long sent_pkt; /* total number of sent packets */
+ long long lost_pkt; /* total number of lost packets */
+ long long too_short_initial_dgram; /* total number of too short datagrams with Initial packets */
+ long long retry_sent; /* total number of Retry sent */
+ long long retry_validated; /* total number of validated Retry tokens */
+ long long retry_error; /* total number of Retry token errors */
+ long long half_open_conn; /* current number of connections waiting for address validation */
+ long long hdshk_fail; /* total number of handshake failures */
+ long long stateless_reset_sent; /* total number of stateless resets sent */
+ /* Special events of interest */
+ long long conn_migration_done; /* total number of connection migrations handled */
+ /* Transport errors */
+ long long quic_transp_err_no_error; /* total number of NO_ERROR connection errors */
+ long long quic_transp_err_internal_error; /* total number of INTERNAL_ERROR connection errors */
+ long long quic_transp_err_connection_refused; /* total number of CONNECTION_REFUSED connection errors */
+ long long quic_transp_err_flow_control_error; /* total number of FLOW_CONTROL_ERROR connection errors */
+ long long quic_transp_err_stream_limit_error; /* total number of STREAM_LIMIT_ERROR connection errors */
+ long long quic_transp_err_stream_state_error; /* total number of STREAM_STATE_ERROR connection errors */
+ long long quic_transp_err_final_size_error; /* total number of FINAL_SIZE_ERROR connection errors */
+ long long quic_transp_err_frame_encoding_error; /* total number of FRAME_ENCODING_ERROR connection errors */
+ long long quic_transp_err_transport_parameter_error; /* total number of TRANSPORT_PARAMETER_ERROR connection errors */
+ long long quic_transp_err_connection_id_limit; /* total number of CONNECTION_ID_LIMIT_ERROR connection errors */
+ long long quic_transp_err_protocol_violation; /* total number of PROTOCOL_VIOLATION connection errors */
+ long long quic_transp_err_invalid_token; /* total number of INVALID_TOKEN connection errors */
+ long long quic_transp_err_application_error; /* total number of APPLICATION_ERROR connection errors */
+ long long quic_transp_err_crypto_buffer_exceeded; /* total number of CRYPTO_BUFFER_EXCEEDED connection errors */
+ long long quic_transp_err_key_update_error; /* total number of KEY_UPDATE_ERROR connection errors */
+ long long quic_transp_err_aead_limit_reached; /* total number of AEAD_LIMIT_REACHED connection errors */
+ long long quic_transp_err_no_viable_path; /* total number of NO_VIABLE_PATH connection errors */
+ long long quic_transp_err_crypto_error; /* total number of CRYPTO_ERROR connection errors */
+ long long quic_transp_err_unknown_error; /* total number of UNKNOWN_ERROR connection errors */
+ /* Stream related counters */
+ long long data_blocked; /* total number of times DATA_BLOCKED frame was received */
+ long long stream_data_blocked; /* total number of times STREAM_DATA_BLOCKED frame was received */
+ long long streams_blocked_bidi; /* total number of times STREAMS_BLOCKED_BIDI frame was received */
+ long long streams_blocked_uni; /* total number of times STREAMS_BLOCKED_UNI frame was received */
+};
+
+#endif /* USE_QUIC */
+#endif /* _HAPROXY_QUIC_STATS_T_H */
diff --git a/include/haproxy/quic_stats.h b/include/haproxy/quic_stats.h
new file mode 100644
index 0000000..b2a8dec
--- /dev/null
+++ b/include/haproxy/quic_stats.h
@@ -0,0 +1,14 @@
+#ifndef _HAPROXY_QUIC_STATS_H
+#define _HAPROXY_QUIC_STATS_H
+
+#ifdef USE_QUIC
+#ifndef USE_OPENSSL
+#error "Must define USE_OPENSSL"
+#endif
+
+#include <haproxy/quic_stats-t.h>
+
+void quic_stats_transp_err_count_inc(struct quic_counters *ctrs, int error_code);
+
+#endif /* USE_QUIC */
+#endif /* _HAPROXY_QUIC_STATS_H */
diff --git a/include/haproxy/quic_stream-t.h b/include/haproxy/quic_stream-t.h
new file mode 100644
index 0000000..e10ca6d
--- /dev/null
+++ b/include/haproxy/quic_stream-t.h
@@ -0,0 +1,48 @@
+#ifndef _HAPROXY_QUIC_STREAM_T_H_
+#define _HAPROXY_QUIC_STREAM_T_H_
+
+#ifdef USE_QUIC
+
+#include <import/ebtree-t.h>
+
+#include <haproxy/buf-t.h>
+#include <haproxy/list-t.h>
+
+/* A QUIC STREAM buffer used for Tx.
+ *
+ * Currently, no offset is associated with each buffer. The qc_stream_desc must
+ * store them in order and keep the offset of the oldest buffer. The buffers
+ * can be freed in strict order.
+ */
+struct qc_stream_buf {
+ struct buffer buf; /* STREAM payload */
+ struct list list; /* element for qc_stream_desc list */
+};
+
+/* QUIC STREAM descriptor.
+ *
+ * This structure is the low-level counterpart of the QUIC STREAM at the MUX
+ * layer. It is stored in the quic-conn and provides facility for Tx buffering.
+ *
+ * Once the MUX has finished transferring data on a STREAM, it must release its
+ * QUIC STREAM descriptor. The descriptor will be kept by the quic_conn until
+ * all acknowledgements have been received. 
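+ *
+ * Illustrative lifecycle, as a rough sketch only (the real call sites live
+ * in the QUIC MUX code; <id>, <type> and <qcs> stand for the stream ID, the
+ * stream type and the MUX stream context):
+ *
+ *   struct qc_stream_desc *s = qc_stream_desc_new(id, type, qcs, qc);
+ *   struct buffer *b = qc_stream_buf_alloc(s, offset, &avail);
+ *   ... fill <b> and emit STREAM frames from it ...
+ *   qc_stream_desc_release(s, final_size);
+ *   ... <s> is freed once qc_stream_desc_ack() has covered all sent data ...
+ *
+ * See the declarations in quic_stream.h below.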
+ */ +struct qc_stream_desc { + struct eb64_node by_id; /* node for quic_conn tree */ + struct quic_conn *qc; + + struct list buf_list; /* buffers waiting for ACK, oldest offset first */ + struct qc_stream_buf *buf; /* current buffer used by the MUX */ + uint64_t buf_offset; /* base offset of current buffer */ + + uint64_t ack_offset; /* last acknowledged offset */ + struct eb_root acked_frms; /* ACK frames tree for non-contiguous ACK ranges */ + + int release; /* set to 1 when the MUX has finished to use this stream */ + + void *ctx; /* MUX specific context */ +}; + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_STREAM_T_H_ */ diff --git a/include/haproxy/quic_stream.h b/include/haproxy/quic_stream.h new file mode 100644 index 0000000..4489728 --- /dev/null +++ b/include/haproxy/quic_stream.h @@ -0,0 +1,23 @@ +#ifndef _HAPROXY_QUIC_STREAM_H_ +#define _HAPROXY_QUIC_STREAM_H_ + +#ifdef USE_QUIC + +#include <haproxy/mux_quic-t.h> +#include <haproxy/quic_stream-t.h> + +struct quic_conn; + +struct qc_stream_desc *qc_stream_desc_new(uint64_t id, enum qcs_type, void *ctx, + struct quic_conn *qc); +void qc_stream_desc_release(struct qc_stream_desc *stream, uint64_t final_size); +int qc_stream_desc_ack(struct qc_stream_desc **stream, size_t offset, size_t len); +void qc_stream_desc_free(struct qc_stream_desc *stream, int closing); + +struct buffer *qc_stream_buf_get(struct qc_stream_desc *stream); +struct buffer *qc_stream_buf_alloc(struct qc_stream_desc *stream, + uint64_t offset, int *avail); +void qc_stream_buf_release(struct qc_stream_desc *stream); + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_STREAM_H_ */ diff --git a/include/haproxy/quic_tls-t.h b/include/haproxy/quic_tls-t.h new file mode 100644 index 0000000..ae65149 --- /dev/null +++ b/include/haproxy/quic_tls-t.h @@ -0,0 +1,283 @@ +/* + * include/types/quic_tls.h + * This file provides definitions for QUIC-TLS. + * + * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _TYPES_QUIC_TLS_H +#define _TYPES_QUIC_TLS_H +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <openssl/evp.h> + +#include <import/ebtree.h> + +#include <haproxy/ncbuf-t.h> +#include <haproxy/quic_ack-t.h> +#include <haproxy/openssl-compat.h> + +/* It seems TLS 1.3 ciphersuites macros differ between openssl and boringssl */ + +#if defined(OPENSSL_IS_BORINGSSL) || defined(OPENSSL_IS_AWSLC) +#if !defined(TLS1_3_CK_AES_128_GCM_SHA256) +#define TLS1_3_CK_AES_128_GCM_SHA256 TLS1_CK_AES_128_GCM_SHA256 +#endif +#if !defined(TLS1_3_CK_AES_256_GCM_SHA384) +#define TLS1_3_CK_AES_256_GCM_SHA384 TLS1_CK_AES_256_GCM_SHA384 +#endif +#if !defined(TLS1_3_CK_CHACHA20_POLY1305_SHA256) +#define TLS1_3_CK_CHACHA20_POLY1305_SHA256 TLS1_CK_CHACHA20_POLY1305_SHA256 +#endif +#if !defined(TLS1_3_CK_AES_128_CCM_SHA256) +/* Note that TLS1_CK_AES_128_CCM_SHA256 is not defined in boringssl */ +#define TLS1_3_CK_AES_128_CCM_SHA256 0x03001304 +#endif +#endif + +/* AEAD iv and secrete key lengths */ +#define QUIC_TLS_IV_LEN 12 /* bytes */ +#define QUIC_TLS_KEY_LEN 32 /* bytes */ +#define QUIC_TLS_SECRET_LEN 48 /* bytes */ +/* The ciphersuites for AEAD QUIC-TLS have 16-bytes authentication tags */ +#define QUIC_TLS_TAG_LEN 16 /* bytes */ + +/* The TLS extensions for QUIC transport parameters */ +#define TLS_EXTENSION_QUIC_TRANSPORT_PARAMETERS 0x0039 +#define TLS_EXTENSION_QUIC_TRANSPORT_PARAMETERS_DRAFT 0xffa5 + +extern struct pool_head *pool_head_quic_pktns; +extern struct pool_head *pool_head_quic_enc_level; +extern struct pool_head *pool_head_quic_tls_ctx; +extern struct pool_head *pool_head_quic_tls_secret; +extern struct pool_head *pool_head_quic_tls_iv; +extern struct pool_head *pool_head_quic_tls_key; + +#define QUIC_HKDF_KEY_LABEL_V1 "quic key" +#define QUIC_HKDF_IV_LABEL_V1 "quic iv" +#define QUIC_HKDF_HP_LABEL_V1 "quic hp" +#define QUIC_HKDF_KU_LABEL_V1 "quic ku" + +#define QUIC_HKDF_KEY_LABEL_V2 "quicv2 key" +#define QUIC_HKDF_IV_LABEL_V2 "quicv2 iv" +#define QUIC_HKDF_HP_LABEL_V2 "quicv2 hp" +#define QUIC_HKDF_KU_LABEL_V2 "quicv2 ku" + +#define QUIC_TLS_RETRY_KEY_DRAFT \ + "\xcc\xce\x18\x7e\xd0\x9a\x09\xd0\x57\x28\x15\x5a\x6c\xb9\x6b\xe1" +#define QUIC_TLS_RETRY_NONCE_DRAFT \ + "\xe5\x49\x30\xf9\x7f\x21\x36\xf0\x53\x0a\x8c\x1c" +#define QUIC_TLS_RETRY_KEY_V1 \ + "\xbe\x0c\x69\x0b\x9f\x66\x57\x5a\x1d\x76\x6b\x54\xe3\x68\xc8\x4e" +#define QUIC_TLS_RETRY_NONCE_V1 \ + "\x46\x15\x99\xd3\x5d\x63\x2b\xf2\x23\x98\x25\xbb" +#define QUIC_TLS_RETRY_KEY_V2 \ + "\x8f\xb4\xb0\x1b\x56\xac\x48\xe2\x60\xfb\xcb\xce\xad\x7c\xcc\x92" +#define QUIC_TLS_RETRY_NONCE_V2 \ + "\xd8\x69\x69\xbc\x2d\x7c\x6d\x99\x90\xef\xb0\x4a" + +/* QUIC handshake states for both clients and servers. 
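+ *
+ * On the server side the expected progression is QUIC_HS_ST_SERVER_INITIAL
+ * -> QUIC_HS_ST_SERVER_HANDSHAKE, then QUIC_HS_ST_COMPLETE when the TLS
+ * handshake completes, and QUIC_HS_ST_CONFIRMED once the handshake is
+ * confirmed as described by RFC 9001 ("Handshake Confirmed").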
*/ +enum quic_handshake_state { + QUIC_HS_ST_CLIENT_HANDSHAKE_FAILED, + QUIC_HS_ST_SERVER_HANDSHAKE_FAILED, + + QUIC_HS_ST_CLIENT_INITIAL, + QUIC_HS_ST_CLIENT_HANDSHAKE, + + QUIC_HS_ST_SERVER_INITIAL, + QUIC_HS_ST_SERVER_HANDSHAKE, + + /* Common to servers and clients */ + QUIC_HS_ST_COMPLETE, + QUIC_HS_ST_CONFIRMED, +}; + +/* QUIC TLS level encryption */ +enum quic_tls_enc_level { + QUIC_TLS_ENC_LEVEL_NONE = -1, + QUIC_TLS_ENC_LEVEL_INITIAL, + QUIC_TLS_ENC_LEVEL_EARLY_DATA, + QUIC_TLS_ENC_LEVEL_HANDSHAKE, + QUIC_TLS_ENC_LEVEL_APP, + /* Please do not insert any value after this following one */ + QUIC_TLS_ENC_LEVEL_MAX, +}; + +/* QUIC packet number spaces */ +enum quic_tls_pktns { + QUIC_TLS_PKTNS_INITIAL, + QUIC_TLS_PKTNS_HANDSHAKE, + QUIC_TLS_PKTNS_01RTT, + /* Please do not insert any value after this following one */ + QUIC_TLS_PKTNS_MAX, +}; + +extern unsigned char initial_salt[20]; +extern const unsigned char initial_salt_draft_29[20]; +extern const unsigned char initial_salt_v1[20]; +extern const unsigned char initial_salt_v2[20]; + +/* QUIC packet number space */ +struct quic_pktns { + struct list list; + struct { + /* List of frames to send. */ + struct list frms; + /* Next packet number to use for transmissions. */ + int64_t next_pn; + /* The packet which has been sent. */ + struct eb_root pkts; + /* The time the most recent ack-eliciting packer was sent. */ + unsigned int time_of_last_eliciting; + /* The time this packet number space has experienced packet loss. */ + unsigned int loss_time; + /* Boolean to denote if we must send probe packet. */ + unsigned int pto_probe; + /* In flight bytes for this packet number space. */ + size_t in_flight; + /* The acknowledgement delay of the packet with the largest packet number */ + uint64_t ack_delay; + } tx; + struct { + /* Largest packet number */ + int64_t largest_pn; + /* Largest acked sent packet. */ + int64_t largest_acked_pn; + struct quic_arngs arngs; + unsigned int nb_aepkts_since_last_ack; + /* The time the packet with the largest packet number was received */ + uint64_t largest_time_received; + } rx; + unsigned int flags; +}; + +/* Key phase used for Key Update */ +struct quic_tls_kp { + EVP_CIPHER_CTX *ctx; + unsigned char *secret; + size_t secretlen; + unsigned char *iv; + size_t ivlen; + unsigned char *key; + size_t keylen; + uint64_t count; + int64_t pn; + unsigned char flags; +}; + +/* Key update phase bit */ +#define QUIC_FL_TLS_KP_BIT_SET (1 << 0) + +struct quic_tls_secrets { + EVP_CIPHER_CTX *ctx; + const EVP_CIPHER *aead; + const EVP_MD *md; + EVP_CIPHER_CTX *hp_ctx; + const EVP_CIPHER *hp; + unsigned char *secret; + size_t secretlen; + /* Header protection key. + * Note: the header protection is applied after packet protection. + * As the header belong to the data, its protection must be removed before removing + * the packet protection. + */ + unsigned char hp_key[32]; + unsigned char *iv; + size_t ivlen; + unsigned char *key; + size_t keylen; + /* Used only on the RX part to store the largest received packet number */ + int64_t pn; +}; + +struct quic_tls_ctx { + struct quic_tls_secrets rx; + struct quic_tls_secrets tx; + unsigned char flags; +}; + +#define QUIC_CRYPTO_BUF_SHIFT 10 +#define QUIC_CRYPTO_BUF_MASK ((1UL << QUIC_CRYPTO_BUF_SHIFT) - 1) +/* The maximum allowed size of CRYPTO data buffer provided by the TLS stack. */ +#define QUIC_CRYPTO_BUF_SZ (1UL << QUIC_CRYPTO_BUF_SHIFT) /* 1 KB */ + +extern struct pool_head *pool_head_quic_crypto_buf; + +/* + * CRYPTO buffer struct. 
+ * Such buffers are used to send CRYPTO data. + */ +struct quic_crypto_buf { + unsigned char data[QUIC_CRYPTO_BUF_SZ]; + size_t sz; +}; + +/* Crypto data stream (one by encryption level) */ +struct quic_cstream { + struct { + uint64_t offset; /* absolute current base offset of ncbuf */ + struct ncbuf ncbuf; /* receive buffer - can handle out-of-order offset frames */ + } rx; + struct { + uint64_t offset; /* last offset of data ready to be sent */ + uint64_t sent_offset; /* last offset sent by transport layer */ + struct buffer buf; /* transmit buffer before sending via xprt */ + } tx; + + struct qc_stream_desc *desc; +}; + +struct quic_enc_level { + struct list list; + /* Attach point to enqueue this encryption level during retransmissions */ + struct list retrans; + /* pointer to list used only during retransmissions */ + struct list *retrans_frms; + /* Encryption level, as defined by the TLS stack. */ + enum ssl_encryption_level_t level; + /* TLS encryption context (AEAD only) */ + struct quic_tls_ctx tls_ctx; + + /* RX part */ + struct { + /* The packets received by the listener I/O handler + * with header protection removed. + */ + struct eb_root pkts; + /* List of QUIC packets with protected header. */ + struct list pqpkts; + /* List of crypto frames received in order. */ + struct list crypto_frms; + } rx; + + /* TX part */ + struct { + struct { + /* Array of CRYPTO data buffers */ + struct quic_crypto_buf **bufs; + /* The number of element in use in the previous array. */ + size_t nb_buf; + /* The total size of the CRYPTO data stored in the CRYPTO buffers. */ + size_t sz; + /* The offset of the CRYPT0 data stream. */ + uint64_t offset; + } crypto; + } tx; + + /* Crypto data stream */ + struct quic_cstream *cstream; + /* Packet number space */ + struct quic_pktns *pktns; +}; + +#endif /* USE_QUIC */ +#endif /* _TYPES_QUIC_TLS_H */ + diff --git a/include/haproxy/quic_tls.h b/include/haproxy/quic_tls.h new file mode 100644 index 0000000..86b8c1e --- /dev/null +++ b/include/haproxy/quic_tls.h @@ -0,0 +1,1116 @@ +/* + * include/proto/quic_tls.h + * This file provides definitions for QUIC-TLS. + * + * Copyright 2019 HAProxy Technologies, Frederic Lecaille <flecaille@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _PROTO_QUIC_TLS_H +#define _PROTO_QUIC_TLS_H +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <stdlib.h> +#include <string.h> + +#include <haproxy/dynbuf.h> +#include <haproxy/pool.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/quic_conn.h> +#include <haproxy/quic_frame.h> +#include <haproxy/quic_tls-t.h> +#include <haproxy/quic_tx.h> +#include <haproxy/quic_trace.h> +#include <haproxy/trace.h> + +int quic_tls_finalize(struct quic_conn *qc, int server); +void quic_tls_ctx_free(struct quic_tls_ctx **ctx); +void quic_pktns_release(struct quic_conn *qc, struct quic_pktns **pktns); +int qc_enc_level_alloc(struct quic_conn *qc, struct quic_pktns **pktns, + struct quic_enc_level **qel, enum ssl_encryption_level_t level); +void qc_enc_level_free(struct quic_conn *qc, struct quic_enc_level **qel); + +void quic_tls_keys_hexdump(struct buffer *buf, + const struct quic_tls_secrets *secs); +void quic_tls_kp_keys_hexdump(struct buffer *buf, + const struct quic_tls_kp *kp); + +void quic_conn_enc_level_uninit(struct quic_conn *qc, struct quic_enc_level *qel); +void quic_tls_secret_hexdump(struct buffer *buf, + const unsigned char *secret, size_t secret_len); + +int quic_derive_initial_secret(const EVP_MD *md, + const unsigned char *initial_salt, size_t initial_salt_sz, + unsigned char *initial_secret, size_t initial_secret_sz, + const unsigned char *secret, size_t secret_sz); + +int quic_tls_derive_initial_secrets(const EVP_MD *md, + unsigned char *rx, size_t rx_sz, + unsigned char *tx, size_t tx_sz, + const unsigned char *secret, size_t secret_sz, + int server); + +int quic_tls_encrypt(unsigned char *buf, size_t len, + const unsigned char *aad, size_t aad_len, + EVP_CIPHER_CTX *ctx, const EVP_CIPHER *aead, + const unsigned char *iv); + +int quic_tls_decrypt2(unsigned char *out, + unsigned char *in, size_t ilen, + unsigned char *aad, size_t aad_len, + EVP_CIPHER_CTX *ctx, const EVP_CIPHER *aead, + const unsigned char *key, const unsigned char *iv); + +int quic_tls_decrypt(unsigned char *buf, size_t len, + unsigned char *aad, size_t aad_len, + EVP_CIPHER_CTX *tls_ctx, const EVP_CIPHER *aead, + const unsigned char *key, const unsigned char *iv); + +int quic_tls_generate_retry_integrity_tag(unsigned char *odcid, unsigned char odcid_len, + unsigned char *buf, size_t len, + const struct quic_version *qv); + +int quic_tls_derive_keys(const EVP_CIPHER *aead, const EVP_CIPHER *hp, + const EVP_MD *md, const struct quic_version *qv, + unsigned char *key, size_t keylen, + unsigned char *iv, size_t ivlen, + unsigned char *hp_key, size_t hp_keylen, + const unsigned char *secret, size_t secretlen); + +int quic_tls_derive_retry_token_secret(const EVP_MD *md, + unsigned char *key, size_t keylen, + unsigned char *iv, size_t ivlen, + const unsigned char *salt, size_t saltlen, + const unsigned char *secret, size_t secretlen); + +int quic_hkdf_expand(const EVP_MD *md, + unsigned char *buf, size_t buflen, + const unsigned char *key, size_t keylen, + const unsigned char *label, size_t labellen); + +int quic_hkdf_expand_label(const EVP_MD *md, + unsigned char *buf, size_t buflen, + const unsigned char *key, size_t keylen, + const unsigned char *label, size_t labellen); + +int quic_hkdf_extract_and_expand(const EVP_MD *md, + unsigned char *buf, size_t buflen, + const unsigned char *key, size_t keylen, + const unsigned char *salt, size_t saltlen, + const unsigned char *label, size_t labellen); + +int quic_tls_rx_ctx_init(EVP_CIPHER_CTX **rx_ctx, + const 
EVP_CIPHER *aead, unsigned char *key); +int quic_tls_tx_ctx_init(EVP_CIPHER_CTX **tx_ctx, + const EVP_CIPHER *aead, unsigned char *key); + +int quic_tls_sec_update(const EVP_MD *md, const struct quic_version *qv, + unsigned char *new_sec, size_t new_seclen, + const unsigned char *sec, size_t seclen); + +void quic_aead_iv_build(unsigned char *iv, size_t ivlen, + unsigned char *aead_iv, size_t aead_ivlen, uint64_t pn); + +/* HP protection (AES) */ +int quic_tls_dec_aes_ctx_init(EVP_CIPHER_CTX **aes_ctx, + const EVP_CIPHER *aes, unsigned char *key); +int quic_tls_enc_aes_ctx_init(EVP_CIPHER_CTX **aes_ctx, + const EVP_CIPHER *aes, unsigned char *key); +int quic_tls_aes_decrypt(unsigned char *out, + const unsigned char *in, size_t inlen, + EVP_CIPHER_CTX *ctx); +int quic_tls_aes_encrypt(unsigned char *out, + const unsigned char *in, size_t inlen, + EVP_CIPHER_CTX *ctx); + +int quic_tls_key_update(struct quic_conn *qc); +void quic_tls_rotate_keys(struct quic_conn *qc); + +static inline const EVP_CIPHER *tls_aead(const SSL_CIPHER *cipher) +{ + switch (SSL_CIPHER_get_id(cipher)) { + case TLS1_3_CK_AES_128_GCM_SHA256: + return EVP_aes_128_gcm(); + case TLS1_3_CK_AES_256_GCM_SHA384: + return EVP_aes_256_gcm(); +#if !defined(OPENSSL_IS_AWSLC) + case TLS1_3_CK_CHACHA20_POLY1305_SHA256: + return EVP_chacha20_poly1305(); +#endif +#if !defined(USE_OPENSSL_WOLFSSL) && !defined(OPENSSL_IS_AWSLC) + case TLS1_3_CK_AES_128_CCM_SHA256: + return EVP_aes_128_ccm(); +#endif + default: + return NULL; + } +} + +static inline const EVP_MD *tls_md(const SSL_CIPHER *cipher) +{ + switch (SSL_CIPHER_get_id(cipher)) { + case TLS1_3_CK_AES_128_GCM_SHA256: + case TLS1_3_CK_AES_128_CCM_SHA256: + case TLS1_3_CK_CHACHA20_POLY1305_SHA256: + return EVP_sha256(); + case TLS1_3_CK_AES_256_GCM_SHA384: + return EVP_sha384(); + default: + return NULL; + } +} + +static inline const EVP_CIPHER *tls_hp(const SSL_CIPHER *cipher) +{ + switch (SSL_CIPHER_get_id(cipher)) { +#if !defined(OPENSSL_IS_AWSLC) + case TLS1_3_CK_CHACHA20_POLY1305_SHA256: + return EVP_chacha20(); +#endif + case TLS1_3_CK_AES_128_CCM_SHA256: + case TLS1_3_CK_AES_128_GCM_SHA256: + return EVP_aes_128_ctr(); + case TLS1_3_CK_AES_256_GCM_SHA384: + return EVP_aes_256_ctr(); + default: + return NULL; + } + +} + +/* These following functions map TLS implementation encryption level to ours */ +static inline struct quic_pktns **ssl_to_quic_pktns(struct quic_conn *qc, + enum ssl_encryption_level_t level) +{ + switch (level) { + case ssl_encryption_initial: + return &qc->ipktns; + case ssl_encryption_early_data: + return &qc->apktns; + case ssl_encryption_handshake: + return &qc->hpktns; + case ssl_encryption_application: + return &qc->apktns; + default: + return NULL; + } +} + +/* These following functions map TLS implementation encryption level to ours */ +static inline struct quic_pktns **qel_to_quic_pktns(struct quic_conn *qc, + enum quic_tls_enc_level level) +{ + switch (level) { + case QUIC_TLS_ENC_LEVEL_INITIAL: + return &qc->ipktns; + case QUIC_TLS_ENC_LEVEL_EARLY_DATA: + return &qc->apktns; + case QUIC_TLS_ENC_LEVEL_HANDSHAKE: + return &qc->hpktns; + case QUIC_TLS_ENC_LEVEL_APP: + return &qc->apktns; + default: + return NULL; + } +} + +/* Map <level> TLS stack encryption level to our internal QUIC TLS encryption level + * if succeeded, or -1 if failed. 
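+ *
+ * For example, ssl_encryption_handshake maps to QUIC_TLS_ENC_LEVEL_HANDSHAKE.
+ * Note that, contrary to this mapping, ssl_to_quic_pktns() above maps both
+ * ssl_encryption_early_data and ssl_encryption_application to the same
+ * application packet number space.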
+ */ +static inline enum quic_tls_enc_level ssl_to_quic_enc_level(enum ssl_encryption_level_t level) +{ + switch (level) { + case ssl_encryption_initial: + return QUIC_TLS_ENC_LEVEL_INITIAL; + case ssl_encryption_early_data: + return QUIC_TLS_ENC_LEVEL_EARLY_DATA; + case ssl_encryption_handshake: + return QUIC_TLS_ENC_LEVEL_HANDSHAKE; + case ssl_encryption_application: + return QUIC_TLS_ENC_LEVEL_APP; + default: + return -1; + } +} + +/* Return the address of the QUIC TLS encryption level associated to <level> TLS + * stack encryption level and attached to <qc> QUIC connection if succeeded, or + * NULL if failed. + */ +static inline struct quic_enc_level **ssl_to_qel_addr(struct quic_conn *qc, + enum ssl_encryption_level_t level) +{ + switch (level) { + case ssl_encryption_initial: + return &qc->iel; + case ssl_encryption_early_data: + return &qc->eel; + case ssl_encryption_handshake: + return &qc->hel; + case ssl_encryption_application: + return &qc->ael; + default: + return NULL; + } +} + +/* Return the address of the QUIC TLS encryption level associated to <level> internal + * encryption level and attached to <qc> QUIC connection if succeeded, or + * NULL if failed. + */ +static inline struct quic_enc_level **qel_to_qel_addr(struct quic_conn *qc, + enum quic_tls_enc_level level) +{ + switch (level) { + case QUIC_TLS_ENC_LEVEL_INITIAL: + return &qc->iel; + case QUIC_TLS_ENC_LEVEL_EARLY_DATA: + return &qc->eel; + case QUIC_TLS_ENC_LEVEL_HANDSHAKE: + return &qc->hel; + case QUIC_TLS_ENC_LEVEL_APP: + return &qc->ael; + default: + return NULL; + } +} + +/* Return the QUIC TLS encryption level associated to <level> internal encryption + * level attached to <qc> QUIC connection if succeeded, or NULL if failed. + */ +static inline struct quic_enc_level *qc_quic_enc_level(const struct quic_conn *qc, + enum quic_tls_enc_level level) +{ + switch (level) { + case QUIC_TLS_ENC_LEVEL_INITIAL: + return qc->iel; + case QUIC_TLS_ENC_LEVEL_EARLY_DATA: + return qc->eel; + case QUIC_TLS_ENC_LEVEL_HANDSHAKE: + return qc->hel; + case QUIC_TLS_ENC_LEVEL_APP: + return qc->ael; + default: + return NULL; + } +} + +/* These two following functions map our encryption level to the TLS implementation ones. */ +static inline enum ssl_encryption_level_t quic_to_ssl_enc_level(enum quic_tls_enc_level level) +{ + switch (level) { + case QUIC_TLS_ENC_LEVEL_INITIAL: + return ssl_encryption_initial; + case QUIC_TLS_ENC_LEVEL_EARLY_DATA: + return ssl_encryption_early_data; + case QUIC_TLS_ENC_LEVEL_HANDSHAKE: + return ssl_encryption_handshake; + case QUIC_TLS_ENC_LEVEL_APP: + return ssl_encryption_application; + default: + return -1; + } +} + +/* Return a human readable string from <state> QUIC handshake state of NULL + * for unknown state values (for debug purpose). 
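+ * For instance QUIC_HS_ST_SERVER_HANDSHAKE yields "SH" and
+ * QUIC_HS_ST_COMPLETE yields "HCP"; these short forms show up in the traces.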
+ */ +static inline char *quic_hdshk_state_str(const enum quic_handshake_state state) +{ + switch (state) { + case QUIC_HS_ST_CLIENT_INITIAL: + return "CI"; + case QUIC_HS_ST_CLIENT_HANDSHAKE: + return "CH"; + case QUIC_HS_ST_CLIENT_HANDSHAKE_FAILED: + return "CF"; + case QUIC_HS_ST_SERVER_INITIAL: + return "SI"; + case QUIC_HS_ST_SERVER_HANDSHAKE: + return "SH"; + case QUIC_HS_ST_SERVER_HANDSHAKE_FAILED: + return "SF"; + case QUIC_HS_ST_COMPLETE: + return "HCP"; + case QUIC_HS_ST_CONFIRMED: + return "HCF"; + } + + return NULL; +} + +/* Return a human readable string from <err> SSL error (returned from + * SSL_get_error()) + */ +static inline const char *ssl_error_str(int err) +{ + switch (err) { + case SSL_ERROR_NONE: + return "NONE"; + case SSL_ERROR_SSL: + return "SSL"; + case SSL_ERROR_WANT_READ: + return "WANT_READ"; + case SSL_ERROR_WANT_WRITE: + return "WANT_WRITE"; + case SSL_ERROR_WANT_X509_LOOKUP: + return "X509_LOOKUP"; + case SSL_ERROR_SYSCALL: + return "SYSCALL"; + case SSL_ERROR_ZERO_RETURN: + return "ZERO_RETURN"; + case SSL_ERROR_WANT_CONNECT: + return "WANT_CONNECT"; + case SSL_ERROR_WANT_ACCEPT: + return "WANT_ACCEPT"; +#if !defined(LIBRESSL_VERSION_NUMBER) && !defined(USE_OPENSSL_WOLFSSL) && !defined(OPENSSL_IS_AWSLC) + case SSL_ERROR_WANT_ASYNC: + return "WANT_ASYNC"; + case SSL_ERROR_WANT_ASYNC_JOB: + return "WANT_ASYNC_JOB"; + case SSL_ERROR_WANT_CLIENT_HELLO_CB: + return "WANT_CLIENT_HELLO_CB"; +#endif + default: + return "UNKNOWN"; + } +} + + +/* Return a character identifying the encryption level from <level> QUIC TLS + * encryption level (for debug purpose). + * Initial -> 'I', Early Data -> 'E', Handshake -> 'H', Application -> 'A' and + * '-' if undefined. + */ +static inline char quic_enc_level_char(enum quic_tls_enc_level level) +{ + switch (level) { + case QUIC_TLS_ENC_LEVEL_INITIAL: + return 'I'; + case QUIC_TLS_ENC_LEVEL_EARLY_DATA: + return 'E'; + case QUIC_TLS_ENC_LEVEL_HANDSHAKE: + return 'H'; + case QUIC_TLS_ENC_LEVEL_APP: + return 'A'; + default: + return '-'; + } +} + +/* Return a character identifying <qel> encryption level from <qc> QUIC connection + * (for debug purpose). + * Initial -> 'I', Early Data -> 'E', Handshake -> 'H', Application -> 'A' and + * '-' if undefined. + */ +static inline char quic_enc_level_char_from_qel(const struct quic_enc_level *qel, + const struct quic_conn *qc) +{ + if (qel == qc->iel) + return 'I'; + else if (qel == qc->eel) + return 'E'; + else if (qel == qc->hel) + return 'H'; + else if (qel == qc->ael) + return 'A'; + return '-'; +} + +/* Return a character identifying the encryption level of a packet depending on + * its <type> type, and its <long_header> header length (for debug purpose). + * Initial -> 'I', ORTT -> '0', Handshake -> 'H', Application -> 'A' and + * '-' if undefined. + */ +static inline char quic_packet_type_enc_level_char(int packet_type) +{ + switch (packet_type) { + case QUIC_PACKET_TYPE_INITIAL: + return 'I'; + case QUIC_PACKET_TYPE_0RTT: + return '0'; + case QUIC_PACKET_TYPE_HANDSHAKE: + return 'H'; + case QUIC_PACKET_TYPE_SHORT: + return 'A'; + default: + return '-'; + } +} + +/* Initialize a QUIC packet number space. + * Never fails. 
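+ * (More precisely, it returns 1 on success and 0 if the pool allocation
+ * fails, in which case <*p> is left untouched.)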
+ */ +static inline int quic_pktns_init(struct quic_conn *qc, struct quic_pktns **p) +{ + struct quic_pktns *pktns; + + pktns = pool_alloc(pool_head_quic_pktns); + if (!pktns) + return 0; + + LIST_INIT(&pktns->tx.frms); + pktns->tx.next_pn = -1; + pktns->tx.pkts = EB_ROOT_UNIQUE; + pktns->tx.time_of_last_eliciting = 0; + pktns->tx.loss_time = TICK_ETERNITY; + pktns->tx.pto_probe = 0; + pktns->tx.in_flight = 0; + pktns->tx.ack_delay = 0; + + pktns->rx.largest_pn = -1; + pktns->rx.largest_acked_pn = -1; + pktns->rx.arngs.root = EB_ROOT_UNIQUE; + pktns->rx.arngs.sz = 0; + pktns->rx.arngs.enc_sz = 0; + pktns->rx.nb_aepkts_since_last_ack = 0; + pktns->rx.largest_time_received = 0; + + pktns->flags = 0; + if (p == &qc->hpktns && qc->apktns) + LIST_INSERT(&qc->ipktns->list, &pktns->list); + else + LIST_APPEND(&qc->pktns_list, &pktns->list); + *p = pktns; + + return 1; +} + +static inline void quic_pktns_tx_pkts_release(struct quic_pktns *pktns, struct quic_conn *qc) +{ + struct eb64_node *node; + + TRACE_ENTER(QUIC_EV_CONN_PHPKTS, qc); + + node = eb64_first(&pktns->tx.pkts); + while (node) { + struct quic_tx_packet *pkt; + struct quic_frame *frm, *frmbak; + + pkt = eb64_entry(node, struct quic_tx_packet, pn_node); + node = eb64_next(node); + if (pkt->flags & QUIC_FL_TX_PACKET_ACK_ELICITING) + qc->path->ifae_pkts--; + list_for_each_entry_safe(frm, frmbak, &pkt->frms, list) { + TRACE_DEVEL("freeing frame from packet", + QUIC_EV_CONN_PRSAFRM, qc, frm, &pkt->pn_node.key); + qc_frm_unref(frm, qc); + LIST_DEL_INIT(&frm->list); + quic_tx_packet_refdec(frm->pkt); + qc_frm_free(qc, &frm); + } + eb64_delete(&pkt->pn_node); + quic_tx_packet_refdec(pkt); + } + + TRACE_LEAVE(QUIC_EV_CONN_PHPKTS, qc); +} + +/* Discard <pktns> packet number space attached to <qc> QUIC connection. + * Its loss information are reset. Deduce the outstanding bytes for this + * packet number space from the outstanding bytes for the path of this + * connection. + * Note that all the non acknowledged TX packets and their frames are freed. + * Always succeeds. + */ +static inline void quic_pktns_discard(struct quic_pktns *pktns, + struct quic_conn *qc) +{ + TRACE_ENTER(QUIC_EV_CONN_PHPKTS, qc); + + if (pktns == qc->ipktns) + qc->flags |= QUIC_FL_CONN_IPKTNS_DCD; + else if (pktns == qc->hpktns) + qc->flags |= QUIC_FL_CONN_HPKTNS_DCD; + qc->path->in_flight -= pktns->tx.in_flight; + qc->path->prep_in_flight -= pktns->tx.in_flight; + qc->path->loss.pto_count = 0; + + pktns->tx.time_of_last_eliciting = 0; + pktns->tx.loss_time = TICK_ETERNITY; + pktns->tx.pto_probe = 0; + pktns->tx.in_flight = 0; + quic_pktns_tx_pkts_release(pktns, qc); + + TRACE_LEAVE(QUIC_EV_CONN_PHPKTS, qc); +} + + +/* Release all the frames attached to <pktns> packet number space */ +static inline void qc_release_pktns_frms(struct quic_conn *qc, + struct quic_pktns *pktns) +{ + struct quic_frame *frm, *frmbak; + + TRACE_ENTER(QUIC_EV_CONN_PHPKTS, qc); + + if (!pktns) + goto leave; + + list_for_each_entry_safe(frm, frmbak, &pktns->tx.frms, list) + qc_frm_free(qc, &frm); + + leave: + TRACE_LEAVE(QUIC_EV_CONN_PHPKTS, qc); +} + +/* Return 1 if <pktns> matches with the Application packet number space of + * <conn> connection which is common to the 0-RTT and 1-RTT encryption levels, 0 + * if not (handshake packets). 
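+ *
+ * Note: 0-RTT and 1-RTT packets share a single packet number space, which is
+ * why both encryption levels point to <qc->apktns> (see QUIC_TLS_PKTNS_01RTT
+ * in quic_tls-t.h).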
+ */ +static inline int quic_application_pktns(struct quic_pktns *pktns, struct quic_conn *qc) +{ + return pktns == qc->apktns; +} + +/* Returns the current largest acknowledged packet number if exists, -1 if not */ +static inline int64_t quic_pktns_get_largest_acked_pn(struct quic_pktns *pktns) +{ + struct eb64_node *ar = eb64_last(&pktns->rx.arngs.root); + + if (!ar) + return -1; + + return eb64_entry(ar, struct quic_arng_node, first)->last; +} + +/* Return a character to identify the packet number space <pktns> of <qc> QUIC + * connection. 'I' for Initial packet number space, 'H' for Handshake packet + * space, and 'A' for Application data number space, or '-' if not found. + */ +static inline char quic_pktns_char(const struct quic_conn *qc, + const struct quic_pktns *pktns) +{ + if (pktns == qc->apktns) + return 'A'; + else if (pktns == qc->hpktns) + return 'H'; + else if (pktns == qc->ipktns) + return 'I'; + + return '-'; +} + +/* Return the TLS encryption level to be used for <packet_type> + * QUIC packet type. + * Returns -1 if there is no TLS encryption level for <packet_type> + * packet type. + */ +static inline enum quic_tls_enc_level quic_packet_type_enc_level(enum quic_pkt_type packet_type) +{ + switch (packet_type) { + case QUIC_PACKET_TYPE_INITIAL: + return QUIC_TLS_ENC_LEVEL_INITIAL; + case QUIC_PACKET_TYPE_0RTT: + return QUIC_TLS_ENC_LEVEL_EARLY_DATA; + case QUIC_PACKET_TYPE_HANDSHAKE: + return QUIC_TLS_ENC_LEVEL_HANDSHAKE; + case QUIC_PACKET_TYPE_RETRY: + return QUIC_TLS_ENC_LEVEL_NONE; + case QUIC_PACKET_TYPE_SHORT: + return QUIC_TLS_ENC_LEVEL_APP; + default: + return QUIC_TLS_ENC_LEVEL_NONE; + } +} + +static inline enum quic_tls_pktns quic_tls_pktns(enum quic_tls_enc_level level) +{ + switch (level) { + case QUIC_TLS_ENC_LEVEL_INITIAL: + return QUIC_TLS_PKTNS_INITIAL; + case QUIC_TLS_ENC_LEVEL_EARLY_DATA: + case QUIC_TLS_ENC_LEVEL_APP: + return QUIC_TLS_PKTNS_01RTT; + case QUIC_TLS_ENC_LEVEL_HANDSHAKE: + return QUIC_TLS_PKTNS_HANDSHAKE; + default: + return -1; + } +} + +/* Return 1 if <pktns> packet number space attached to <qc> connection has been discarded, + * 0 if not. + */ +static inline int quic_tls_pktns_is_dcd(struct quic_conn *qc, struct quic_pktns *pktns) +{ + if (pktns == qc->apktns) + return 0; + + if ((pktns == qc->ipktns && (qc->flags & QUIC_FL_CONN_IPKTNS_DCD)) || + (pktns == qc->hpktns && (qc->flags & QUIC_FL_CONN_HPKTNS_DCD))) + return 1; + + return 0; +} + +/* Return 1 the packet number space attached to <qc> connection with <type> associated + * packet type has been discarded, 0 if not. + */ +static inline int quic_tls_pkt_type_pktns_dcd(struct quic_conn *qc, unsigned char type) +{ + if ((type == QUIC_PACKET_TYPE_INITIAL && (qc->flags & QUIC_FL_CONN_IPKTNS_DCD)) || + (type == QUIC_PACKET_TYPE_HANDSHAKE && (qc->flags & QUIC_FL_CONN_HPKTNS_DCD))) + return 1; + + return 0; +} + +/* Select the correct TLS cipher context to used to decipher an RX packet + * with <type> as type and <version> as version and attached to <qc> + * connection from <qel> encryption level. + */ +static inline struct quic_tls_ctx *qc_select_tls_ctx(struct quic_conn *qc, + struct quic_enc_level *qel, + unsigned char type, + const struct quic_version *version) +{ + return type != QUIC_PACKET_TYPE_INITIAL ? &qel->tls_ctx : + version == qc->negotiated_version ? 
qc->nictx : &qel->tls_ctx; +} + +/* Reset all members of <ctx> to default values, ->hp_key[] excepted */ +static inline void quic_tls_ctx_reset(struct quic_tls_ctx *ctx) +{ + ctx->rx.ctx = NULL; + ctx->rx.aead = NULL; + ctx->rx.md = NULL; + ctx->rx.hp_ctx = NULL; + ctx->rx.hp = NULL; + ctx->rx.secret = NULL; + ctx->rx.secretlen = 0; + ctx->rx.iv = NULL; + ctx->rx.ivlen = 0; + ctx->rx.key = NULL; + ctx->rx.keylen = 0; + ctx->rx.pn = 0; + + ctx->tx.ctx = NULL; + ctx->tx.aead = NULL; + ctx->tx.md = NULL; + ctx->tx.hp_ctx = NULL; + ctx->tx.hp = NULL; + ctx->tx.secret = NULL; + ctx->tx.secretlen = 0; + ctx->tx.iv = NULL; + ctx->tx.ivlen = 0; + ctx->tx.key = NULL; + ctx->tx.keylen = 0; + /* Not used on the TX path. */ + ctx->tx.pn = 0; + + ctx->flags = 0; +} + +/* Erase and free the secrets for a QUIC encryption level with <ctx> as + * context. + * Always succeeds. + */ +static inline void quic_tls_ctx_secs_free(struct quic_tls_ctx *ctx) +{ + if (!ctx) + return; + + if (ctx->rx.iv) { + memset(ctx->rx.iv, 0, ctx->rx.ivlen); + ctx->rx.ivlen = 0; + } + if (ctx->rx.key) { + memset(ctx->rx.key, 0, ctx->rx.keylen); + ctx->rx.keylen = 0; + } + if (ctx->tx.iv) { + memset(ctx->tx.iv, 0, ctx->tx.ivlen); + ctx->tx.ivlen = 0; + } + if (ctx->tx.key) { + memset(ctx->tx.key, 0, ctx->tx.keylen); + ctx->tx.keylen = 0; + } + + /* RX HP protection */ + EVP_CIPHER_CTX_free(ctx->rx.hp_ctx); + /* RX AEAD decryption */ + EVP_CIPHER_CTX_free(ctx->rx.ctx); + pool_free(pool_head_quic_tls_iv, ctx->rx.iv); + pool_free(pool_head_quic_tls_key, ctx->rx.key); + + /* TX HP protection */ + EVP_CIPHER_CTX_free(ctx->tx.hp_ctx); + /* TX AEAD encryption */ + EVP_CIPHER_CTX_free(ctx->tx.ctx); + pool_free(pool_head_quic_tls_iv, ctx->tx.iv); + pool_free(pool_head_quic_tls_key, ctx->tx.key); + + quic_tls_ctx_reset(ctx); +} + +/* Allocate the secrete keys for a QUIC encryption level with <ctx> as context. + * Returns 1 if succeeded, 0 if not. + */ +static inline int quic_tls_ctx_keys_alloc(struct quic_tls_ctx *ctx) +{ + if (ctx->rx.key) + goto write; + + if (!(ctx->rx.iv = pool_alloc(pool_head_quic_tls_iv)) || + !(ctx->rx.key = pool_alloc(pool_head_quic_tls_key))) + goto err; + + write: + if (ctx->tx.key) + goto out; + + if (!(ctx->tx.iv = pool_alloc(pool_head_quic_tls_iv)) || + !(ctx->tx.key = pool_alloc(pool_head_quic_tls_key))) + goto err; + + ctx->rx.ivlen = ctx->tx.ivlen = QUIC_TLS_IV_LEN; + ctx->rx.keylen = ctx->tx.keylen = QUIC_TLS_KEY_LEN; +out: + return 1; + + err: + quic_tls_ctx_secs_free(ctx); + return 0; +} + +/* Release the memory allocated for <secs> secrets */ +static inline void quic_tls_secrets_keys_free(struct quic_tls_secrets *secs) +{ + if (secs->iv) { + memset(secs->iv, 0, secs->ivlen); + secs->ivlen = 0; + } + + if (secs->key) { + memset(secs->key, 0, secs->keylen); + secs->keylen = 0; + } + + /* HP protection */ + EVP_CIPHER_CTX_free(secs->hp_ctx); + /* AEAD decryption */ + EVP_CIPHER_CTX_free(secs->ctx); + pool_free(pool_head_quic_tls_iv, secs->iv); + pool_free(pool_head_quic_tls_key, secs->key); + + secs->iv = secs->key = NULL; +} + +/* Allocate the memory for the <secs> secrets. + * Return 1 if succeeded, 0 if not. 
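+ * On failure, the members already allocated are released by an internal call
+ * to quic_tls_secrets_keys_free(), so the caller has nothing to clean up.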
+ */ +static inline int quic_tls_secrets_keys_alloc(struct quic_tls_secrets *secs) +{ + if (!(secs->iv = pool_alloc(pool_head_quic_tls_iv)) || + !(secs->key = pool_alloc(pool_head_quic_tls_key))) + goto err; + + secs->ivlen = QUIC_TLS_IV_LEN; + secs->keylen = QUIC_TLS_KEY_LEN; + + return 1; + + err: + quic_tls_secrets_keys_free(secs); + return 0; +} + +/* Release the memory allocated for the negotiated Initial QUIC TLS context + * attached to <qc> connection. + */ +static inline void quic_nictx_free(struct quic_conn *qc) +{ + quic_tls_ctx_secs_free(qc->nictx); + pool_free(pool_head_quic_tls_ctx, qc->nictx); + qc->nictx = NULL; +} + +/* Initialize a TLS cryptographic context for the Initial encryption level. */ +static inline int quic_initial_tls_ctx_init(struct quic_tls_ctx *ctx) +{ + ctx->rx.aead = ctx->tx.aead = EVP_aes_128_gcm(); + ctx->rx.md = ctx->tx.md = EVP_sha256(); + ctx->rx.hp = ctx->tx.hp = EVP_aes_128_ctr(); + + ctx->rx.iv = NULL; + ctx->rx.ivlen = 0; + ctx->rx.key = NULL; + ctx->rx.keylen = 0; + ctx->rx.secret = NULL; + ctx->rx.secretlen = 0; + + ctx->tx.iv = NULL; + ctx->tx.ivlen = 0; + ctx->tx.key = NULL; + ctx->tx.keylen = 0; + ctx->tx.secret = NULL; + ctx->tx.secretlen = 0; + + return quic_tls_ctx_keys_alloc(ctx); +} + +static inline int quic_tls_level_pkt_type(enum quic_tls_enc_level level) +{ + switch (level) { + case QUIC_TLS_ENC_LEVEL_INITIAL: + return QUIC_PACKET_TYPE_INITIAL; + case QUIC_TLS_ENC_LEVEL_EARLY_DATA: + return QUIC_PACKET_TYPE_0RTT; + case QUIC_TLS_ENC_LEVEL_HANDSHAKE: + return QUIC_PACKET_TYPE_HANDSHAKE; + case QUIC_TLS_ENC_LEVEL_APP: + return QUIC_PACKET_TYPE_SHORT; + default: + return -1; + } +} + +/* Return the packet type associated to <qel> encryption for <qc> QUIC connection, + * or -1 if not found. + */ +static inline enum quic_pkt_type quic_enc_level_pkt_type(struct quic_conn *qc, + struct quic_enc_level *qel) +{ + if (qel == qc->iel) + return QUIC_PACKET_TYPE_INITIAL; + else if (qel == qc->hel) + return QUIC_PACKET_TYPE_HANDSHAKE; + else if (qel == qc->eel) + return QUIC_PACKET_TYPE_0RTT; + else if (qel == qc->ael) + return QUIC_PACKET_TYPE_SHORT; + else + return -1; +} + +/* Derive the initial secrets with <ctx> as QUIC TLS context which is the + * cryptographic context for the first encryption level (Initial) from + * <cid> connection ID with <cidlen> as length (in bytes) for a server or not + * depending on <server> boolean value. + * Return 1 if succeeded or 0 if not. 
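+ *
+ * The derivation roughly follows RFC 9001, section 5.2:
+ *
+ *   initial_secret = HKDF-Extract(initial_salt, client_dst_connection_id)
+ *   client_in      = HKDF-Expand-Label(initial_secret, "client in", "", 32)
+ *   server_in      = HKDF-Expand-Label(initial_secret, "server in", "", 32)
+ *
+ * <server> only decides which of client_in/server_in becomes the RX secret
+ * and which becomes the TX secret.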
+ */ +static inline int qc_new_isecs(struct quic_conn *qc, + struct quic_tls_ctx *ctx, const struct quic_version *ver, + const unsigned char *cid, size_t cidlen, int server) +{ + unsigned char initial_secret[32]; + /* Initial secret to be derived for incoming packets */ + unsigned char rx_init_sec[32]; + /* Initial secret to be derived for outgoing packets */ + unsigned char tx_init_sec[32]; + struct quic_tls_secrets *rx_ctx, *tx_ctx; + + TRACE_ENTER(QUIC_EV_CONN_ISEC); + if (!quic_initial_tls_ctx_init(ctx)) + goto err; + + if (!quic_derive_initial_secret(ctx->rx.md, + ver->initial_salt, ver->initial_salt_len, + initial_secret, sizeof initial_secret, + cid, cidlen)) + goto err; + + if (!quic_tls_derive_initial_secrets(ctx->rx.md, + rx_init_sec, sizeof rx_init_sec, + tx_init_sec, sizeof tx_init_sec, + initial_secret, sizeof initial_secret, server)) + goto err; + + rx_ctx = &ctx->rx; + tx_ctx = &ctx->tx; + if (!quic_tls_derive_keys(ctx->rx.aead, ctx->rx.hp, ctx->rx.md, ver, + rx_ctx->key, rx_ctx->keylen, + rx_ctx->iv, rx_ctx->ivlen, + rx_ctx->hp_key, sizeof rx_ctx->hp_key, + rx_init_sec, sizeof rx_init_sec)) + goto err; + + if (!quic_tls_rx_ctx_init(&rx_ctx->ctx, rx_ctx->aead, rx_ctx->key)) + goto err; + + if (!quic_tls_enc_aes_ctx_init(&rx_ctx->hp_ctx, rx_ctx->hp, rx_ctx->hp_key)) + goto err; + + if (!quic_tls_derive_keys(ctx->tx.aead, ctx->tx.hp, ctx->tx.md, ver, + tx_ctx->key, tx_ctx->keylen, + tx_ctx->iv, tx_ctx->ivlen, + tx_ctx->hp_key, sizeof tx_ctx->hp_key, + tx_init_sec, sizeof tx_init_sec)) + goto err; + + if (!quic_tls_tx_ctx_init(&tx_ctx->ctx, tx_ctx->aead, tx_ctx->key)) + goto err; + + if (!quic_tls_enc_aes_ctx_init(&tx_ctx->hp_ctx, tx_ctx->hp, tx_ctx->hp_key)) + goto err; + + TRACE_LEAVE(QUIC_EV_CONN_ISEC, qc, rx_init_sec, tx_init_sec); + + return 1; + + err: + TRACE_DEVEL("leaving in error", QUIC_EV_CONN_ISEC); + return 0; +} + +/* Reset all members of <tls_kp> to default values. */ +static inline void quic_tls_ku_reset(struct quic_tls_kp *tls_kp) +{ + tls_kp->ctx = NULL; + tls_kp->secret = NULL; + tls_kp->iv = NULL; + tls_kp->key = NULL; +} + +/* Release the memory allocated for all the key update key phase + * structures for <qc> QUIC connection. + * Always succeeds. + */ +static inline void quic_tls_ku_free(struct quic_conn *qc) +{ + EVP_CIPHER_CTX_free(qc->ku.prv_rx.ctx); + pool_free(pool_head_quic_tls_secret, qc->ku.prv_rx.secret); + pool_free(pool_head_quic_tls_iv, qc->ku.prv_rx.iv); + pool_free(pool_head_quic_tls_key, qc->ku.prv_rx.key); + quic_tls_ku_reset(&qc->ku.prv_rx); + EVP_CIPHER_CTX_free(qc->ku.nxt_rx.ctx); + pool_free(pool_head_quic_tls_secret, qc->ku.nxt_rx.secret); + pool_free(pool_head_quic_tls_iv, qc->ku.nxt_rx.iv); + pool_free(pool_head_quic_tls_key, qc->ku.nxt_rx.key); + quic_tls_ku_reset(&qc->ku.nxt_rx); + EVP_CIPHER_CTX_free(qc->ku.nxt_tx.ctx); + pool_free(pool_head_quic_tls_secret, qc->ku.nxt_tx.secret); + pool_free(pool_head_quic_tls_iv, qc->ku.nxt_tx.iv); + pool_free(pool_head_quic_tls_key, qc->ku.nxt_tx.key); + quic_tls_ku_reset(&qc->ku.nxt_tx); +} + +/* Initialize <kp> key update secrets, allocating the required memory. + * Return 1 if all the secrets could be allocated, 0 if not. + * This is the responsibility of the caller to release the memory + * allocated by this function in case of failure. 
+ */
+static inline int quic_tls_kp_init(struct quic_tls_kp *kp)
+{
+	kp->count = 0;
+	kp->pn = 0;
+	kp->flags = 0;
+	kp->secret = pool_alloc(pool_head_quic_tls_secret);
+	kp->secretlen = QUIC_TLS_SECRET_LEN;
+	kp->iv = pool_alloc(pool_head_quic_tls_iv);
+	kp->ivlen = QUIC_TLS_IV_LEN;
+	kp->key = pool_alloc(pool_head_quic_tls_key);
+	kp->keylen = QUIC_TLS_KEY_LEN;
+
+	return kp->secret && kp->iv && kp->key;
+}
+
+/* Initialize all the key update key phase structures for <qc>
+ * QUIC connection, allocating the required memory.
+ *
+ * Returns 1 if succeeded, 0 if not. The caller is responsible for calling
+ * quic_tls_ku_free() on error to clean up any partially allocated content.
+ */
+static inline int quic_tls_ku_init(struct quic_conn *qc)
+{
+	struct quic_tls_kp *prv_rx = &qc->ku.prv_rx;
+	struct quic_tls_kp *nxt_rx = &qc->ku.nxt_rx;
+	struct quic_tls_kp *nxt_tx = &qc->ku.nxt_tx;
+
+	if (!quic_tls_kp_init(prv_rx) ||
+	    !quic_tls_kp_init(nxt_rx) ||
+	    !quic_tls_kp_init(nxt_tx))
+		goto err;
+
+	return 1;
+
+ err:
+	return 0;
+}
+
+/* Return 1 if <qel> has RX secrets, 0 if not. */
+static inline int quic_tls_has_rx_sec(const struct quic_enc_level *qel)
+{
+	return qel && !!qel->tls_ctx.rx.key;
+}
+
+/* Return 1 if <qel> has TX secrets, 0 if not. */
+static inline int quic_tls_has_tx_sec(const struct quic_enc_level *qel)
+{
+	return qel && !!qel->tls_ctx.tx.key;
+}
+
+/* Return 1 if there are RX packets for <qel> QUIC encryption level, 0 if not */
+static inline int qc_el_rx_pkts(struct quic_enc_level *qel)
+{
+	int ret;
+
+	ret = !eb_is_empty(&qel->rx.pkts);
+
+	return ret;
+}
+
+/* Delete all RX packets for <qel> QUIC encryption level */
+static inline void qc_el_rx_pkts_del(struct quic_enc_level *qel)
+{
+	struct eb64_node *node;
+
+	node = eb64_first(&qel->rx.pkts);
+	while (node) {
+		struct quic_rx_packet *pkt =
+			eb64_entry(node, struct quic_rx_packet, pn_node);
+
+		node = eb64_next(node);
+		eb64_delete(&pkt->pn_node);
+		quic_rx_packet_refdec(pkt);
+	}
+}
+
+static inline void qc_list_qel_rx_pkts(struct quic_enc_level *qel)
+{
+	struct eb64_node *node;
+
+	node = eb64_first(&qel->rx.pkts);
+	while (node) {
+		struct quic_rx_packet *pkt;
+
+		pkt = eb64_entry(node, struct quic_rx_packet, pn_node);
+		fprintf(stderr, "pkt@%p type=%d pn=%llu\n",
+		        pkt, pkt->type, (ull)pkt->pn_node.key);
+		node = eb64_next(node);
+	}
+}
+
+/* Return 1 if <qc> needs to emit frames for <qel> encryption level, 0 if not. */
+static inline int qc_need_sending(struct quic_conn *qc, struct quic_enc_level *qel)
+{
+	return (qc->flags & QUIC_FL_CONN_IMMEDIATE_CLOSE) ||
+	       (qel->pktns->flags & QUIC_FL_PKTNS_ACK_REQUIRED) ||
+	       qel->pktns->tx.pto_probe ||
+	       !LIST_ISEMPTY(&qel->pktns->tx.frms);
+}
+
+/* Return 1 if <qc> connection may probe the Initial packet number space, 0 if not.
+ * This is not the case if the remote peer address is not validated and if
+ * it cannot send at least QUIC_INITIAL_PACKET_MINLEN bytes.
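+ *
+ * Worked example (figures are illustrative): with a peer address not yet
+ * validated, after receiving 300 bytes and preparing none, the budget from
+ * quic_may_send_bytes() is 3 * 300 = 900 bytes, which is below
+ * QUIC_INITIAL_PACKET_MINLEN (1200 bytes per RFC 9000), so probing the
+ * Initial packet number space must be delayed.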
+ */ +static inline int qc_may_probe_ipktns(struct quic_conn *qc) +{ + return quic_peer_validated_addr(qc) || + quic_may_send_bytes(qc) >= QUIC_INITIAL_PACKET_MINLEN; +} + + + +#endif /* USE_QUIC */ +#endif /* _PROTO_QUIC_TLS_H */ + diff --git a/include/haproxy/quic_tp-t.h b/include/haproxy/quic_tp-t.h new file mode 100644 index 0000000..4897441 --- /dev/null +++ b/include/haproxy/quic_tp-t.h @@ -0,0 +1,118 @@ +#ifndef _HAPROXY_QUIC_TP_T_H +#define _HAPROXY_QUIC_TP_T_H +#ifdef USE_QUIC +#ifndef USE_OPENSSL +#error "Must define USE_OPENSSL" +#endif + +#include <inttypes.h> +#include <sys/socket.h> +#include <netinet/in.h> + +#define QUIC_STATELESS_RESET_TOKEN_LEN 16 + +/* Default QUIC connection transport parameters */ +extern struct quic_transport_params quic_dflt_transport_params; + +struct tp_cid { + uint8_t len; + uint8_t data[20]; +}; + +struct tp_preferred_address { + uint16_t ipv4_port; + uint16_t ipv6_port; + struct in_addr ipv4_addr; + struct in6_addr ipv6_addr; + struct tp_cid cid; + uint8_t stateless_reset_token[QUIC_STATELESS_RESET_TOKEN_LEN]; +}; + +struct tp_version_information { + uint32_t chosen; + const struct quic_version *negotiated_version; +}; + +/* Default values for the absent transport parameters */ +#define QUIC_TP_DFLT_MAX_UDP_PAYLOAD_SIZE 65527 /* bytes */ +#define QUIC_TP_DFLT_ACK_DELAY_COMPONENT 3 /* milliseconds */ +#define QUIC_TP_DFLT_MAX_ACK_DELAY 25 /* milliseconds */ +#define QUIC_TP_DFLT_ACTIVE_CONNECTION_ID_LIMIT 2 /* number of connections */ +/* These ones are our implementation default values when not set + * by configuration + */ +#define QUIC_TP_DFLT_FRONT_MAX_IDLE_TIMEOUT 30000 /* milliseconds */ +#define QUIC_TP_DFLT_FRONT_MAX_STREAMS_BIDI 100 +#define QUIC_TP_DFLT_BACK_MAX_IDLE_TIMEOUT 30000 /* milliseconds */ + +/* Types of QUIC transport parameters */ +#define QUIC_TP_ORIGINAL_DESTINATION_CONNECTION_ID 0x00 +#define QUIC_TP_MAX_IDLE_TIMEOUT 0x01 +#define QUIC_TP_STATELESS_RESET_TOKEN 0x02 +#define QUIC_TP_MAX_UDP_PAYLOAD_SIZE 0x03 +#define QUIC_TP_INITIAL_MAX_DATA 0x04 +#define QUIC_TP_INITIAL_MAX_STREAM_DATA_BIDI_LOCAL 0x05 +#define QUIC_TP_INITIAL_MAX_STREAM_DATA_BIDI_REMOTE 0x06 +#define QUIC_TP_INITIAL_MAX_STREAM_DATA_UNI 0x07 +#define QUIC_TP_INITIAL_MAX_STREAMS_BIDI 0x08 +#define QUIC_TP_INITIAL_MAX_STREAMS_UNI 0x09 +#define QUIC_TP_ACK_DELAY_EXPONENT 0x0a +#define QUIC_TP_MAX_ACK_DELAY 0x0b +#define QUIC_TP_DISABLE_ACTIVE_MIGRATION 0x0c +#define QUIC_TP_PREFERRED_ADDRESS 0x0d +#define QUIC_TP_ACTIVE_CONNECTION_ID_LIMIT 0x0e +#define QUIC_TP_INITIAL_SOURCE_CONNECTION_ID 0x0f +#define QUIC_TP_RETRY_SOURCE_CONNECTION_ID 0x10 +#define QUIC_TP_VERSION_INFORMATION 0x11 + +/* + * These defines are not for transport parameter type, but the maximum accepted value for + * transport parameter types. + */ +#define QUIC_TP_ACK_DELAY_EXPONENT_LIMIT 20 +#define QUIC_TP_MAX_ACK_DELAY_LIMIT (1UL << 14) + +/* The maximum length of encoded transport parameters for any QUIC peer. */ +#define QUIC_TP_MAX_ENCLEN 128 +/* + * QUIC transport parameters. + * Note that forbidden parameters sent by clients MUST generate TRANSPORT_PARAMETER_ERROR errors. 
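+ *
+ * Encoding sketch using the helpers declared in quic_tp.h (variable names,
+ * error handling and the zero-means-failure convention are this sketch's
+ * assumptions, not an actual call site):
+ *
+ *   unsigned char buf[QUIC_TP_MAX_ENCLEN];
+ *   struct quic_transport_params p;
+ *
+ *   quic_transport_params_init(&p, 1);   (1 = server-side defaults)
+ *   if (!quic_transport_params_encode(buf, buf + sizeof buf, &p,
+ *                                     chosen_version, 1))
+ *       (encoding failed: not enough room in <buf>)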
+ */
+struct quic_transport_params {
+	uint64_t max_idle_timeout;
+	uint64_t max_udp_payload_size;                 /* Default: 65527 bytes (max of UDP payload for IPv6) */
+	uint64_t initial_max_data;
+	uint64_t initial_max_stream_data_bidi_local;
+	uint64_t initial_max_stream_data_bidi_remote;
+	uint64_t initial_max_stream_data_uni;
+	uint64_t initial_max_streams_bidi;
+	uint64_t initial_max_streams_uni;
+	uint64_t ack_delay_exponent;                   /* Default: 3, max: 20 */
+	uint64_t max_ack_delay;                        /* Default: 25ms, max: 2^14 ms */
+	uint64_t active_connection_id_limit;
+
+	/* Booleans */
+	uint8_t disable_active_migration;
+	uint8_t with_stateless_reset_token;
+	uint8_t with_preferred_address;
+	uint8_t original_destination_connection_id_present;
+	uint8_t initial_source_connection_id_present;
+
+	uint8_t stateless_reset_token[QUIC_STATELESS_RESET_TOKEN_LEN]; /* Forbidden for clients */
+	/*
+	 * MUST be sent by servers.
+	 * When received by clients, must be set to 1 if present.
+	 */
+	struct tp_cid original_destination_connection_id;              /* Forbidden for clients */
+	/*
+	 * MUST be sent by servers after Retry.
+	 */
+	struct tp_cid retry_source_connection_id;                      /* Forbidden for clients */
+	/* MUST be present both for servers and clients. */
+	struct tp_cid initial_source_connection_id;
+	struct tp_preferred_address preferred_address;                 /* Forbidden for clients */
+	struct tp_version_information version_information;
+};
+
+#endif /* USE_QUIC */
+#endif /* _HAPROXY_QUIC_TP_T_H */
diff --git a/include/haproxy/quic_tp.h b/include/haproxy/quic_tp.h
new file mode 100644
index 0000000..d3bdd18
--- /dev/null
+++ b/include/haproxy/quic_tp.h
@@ -0,0 +1,124 @@
+#ifndef _HAPROXY_QUIC_TP_H
+#define _HAPROXY_QUIC_TP_H
+#ifdef USE_QUIC
+#ifndef USE_OPENSSL
+#error "Must define USE_OPENSSL"
+#endif
+
+#include <haproxy/chunk.h>
+#include <haproxy/quic_conn-t.h>
+#include <haproxy/quic_tp-t.h>
+
+void quic_transport_params_init(struct quic_transport_params *p, int server);
+int quic_transport_params_encode(unsigned char *buf,
+                                 const unsigned char *end,
+                                 struct quic_transport_params *p,
+                                 const struct quic_version *chosen_version,
+                                 int server);
+
+int quic_transport_params_store(struct quic_conn *conn, int server,
+                                const unsigned char *buf,
+                                const unsigned char *end);
+
+int qc_lstnr_params_init(struct quic_conn *qc,
+                         const struct quic_transport_params *listener_params,
+                         const unsigned char *stateless_reset_token,
+                         const unsigned char *dcid, size_t dcidlen,
+                         const unsigned char *scid, size_t scidlen,
+                         const struct quic_cid *token_odcid);
+
+/* Dump <cid> transport parameter connection ID value if present (non-null
+ * length). Used only for debugging purposes.
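+ *
+ * For instance, a 4-byte CID { 0x00, 0x01, 0xaa, 0xbb } is appended to
+ * <buf> as the hexadecimal string "0001aabb"; a zero-length CID appends
+ * nothing.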
+ */ +static inline void quic_tp_cid_dump(struct buffer *buf, + const struct tp_cid *cid) +{ + int i; + + for (i = 0; i < cid->len; i++) + chunk_appendf(buf, "%02x", cid->data[i]); +} + +static inline void quic_tp_version_info_dump(struct buffer *b, + const struct tp_version_information *tp, int local) +{ + if (!tp->chosen) + return; + + chunk_appendf(b, " versions:chosen=0x%08x", tp->chosen); + if (tp->negotiated_version) + chunk_appendf(b, ",negotiated=0x%08x", tp->negotiated_version->num); +} + +static inline void quic_transport_params_dump(struct buffer *b, + const struct quic_conn *qc, + const struct quic_transport_params *p) +{ + int local = p == &qc->rx.params; + + if (p->original_destination_connection_id.len) { + chunk_appendf(b, " odcid="); + quic_tp_cid_dump(b, &p->original_destination_connection_id); + } + chunk_appendf(b, " iscid="); + quic_tp_cid_dump(b, &p->initial_source_connection_id); + if (p->retry_source_connection_id.len) { + chunk_appendf(b, " rscid="); + quic_tp_cid_dump(b, &p->retry_source_connection_id); + } + chunk_appendf(b, "\n"); + + chunk_appendf(b, " midle_timeout=%llums", (ull)p->max_idle_timeout); + chunk_appendf(b, " mudp_payload_sz=%llu", (ull)p->max_udp_payload_size); + chunk_appendf(b, " ack_delay_exp=%llu", (ull)p->ack_delay_exponent); + chunk_appendf(b, " mack_delay=%llums", (ull)p->max_ack_delay); + chunk_appendf(b, " act_cid_limit=%llu\n", (ull)p->active_connection_id_limit); + + chunk_appendf(b, " md=%llu", (ull)p->initial_max_data); + chunk_appendf(b, " msd_bidi_l=%llu", + (ull)p->initial_max_stream_data_bidi_local); + chunk_appendf(b, " msd_bidi_r=%llu", + (ull)p->initial_max_stream_data_bidi_remote); + chunk_appendf(b, " msd_uni=%llu", + (ull)p->initial_max_stream_data_uni); + chunk_appendf(b, " ms_bidi=%llu", (ull)p->initial_max_streams_bidi); + chunk_appendf(b, " ms_uni=%llu\n", (ull)p->initial_max_streams_uni); + + if (p->disable_active_migration || p->with_stateless_reset_token) { + int prev = 0; + + chunk_appendf(b, " ("); + if (p->disable_active_migration) { + if (prev) + chunk_appendf(b, ","); + prev = 1; + chunk_appendf(b, "no_act_migr"); + } + if (p->with_stateless_reset_token) { + if (prev) + chunk_appendf(b, ","); + prev = 1; + chunk_appendf(b, "stless_rst_tok"); + } + chunk_appendf(b, ")"); + } + + if (p->with_preferred_address) { + char bufaddr[INET6_ADDRSTRLEN]; + chunk_appendf(b, " pref_addr="); + inet_ntop(AF_INET, &p->preferred_address.ipv4_addr, + bufaddr, sizeof(bufaddr)); + chunk_appendf(b, "%s:%hu ", bufaddr, p->preferred_address.ipv4_port); + + inet_ntop(AF_INET6, &p->preferred_address.ipv6_addr, + bufaddr, sizeof(bufaddr)); + chunk_appendf(b, "[%s]:%hu ", bufaddr, p->preferred_address.ipv6_port); + quic_tp_cid_dump(b, &p->preferred_address.cid); + chunk_appendf(b, "\n"); + } + + quic_tp_version_info_dump(b, &p->version_information, local); +} + +#endif /* USE_QUIC */ +#endif /* _HAPROXY_QUIC_TP_H */ diff --git a/include/haproxy/quic_trace-t.h b/include/haproxy/quic_trace-t.h new file mode 100644 index 0000000..7ebc8a7 --- /dev/null +++ b/include/haproxy/quic_trace-t.h @@ -0,0 +1,103 @@ +/* + * include/haproxy/quic_trace-t.h + * Definitions for QUIC traces internal types, constants and flags. + * + * Copyright (C) 2023 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#ifndef _HAPROXY_QUIC_TRACE_T_H +#define _HAPROXY_QUIC_TRACE_T_H + +#include <haproxy/quic_tls-t.h> +#include <haproxy/trace-t.h> + +extern struct trace_source trace_quic; + +/* Used only for QUIC TLS key phase traces */ +struct quic_kp_trace { + const unsigned char *rx_sec; + size_t rx_seclen; + const struct quic_tls_kp *rx; + const unsigned char *tx_sec; + size_t tx_seclen; + const struct quic_tls_kp *tx; +}; + +/* Only for debug purpose */ +struct enc_debug_info { + unsigned char *payload; + size_t payload_len; + unsigned char *aad; + size_t aad_len; + uint64_t pn; +}; + +/* Structure to store enough information about the RX CRYPTO frames. */ +struct quic_rx_crypto_frm { + struct eb64_node offset_node; + uint64_t len; + const unsigned char *data; + struct quic_rx_packet *pkt; +}; + +#define QUIC_EV_CONN_NEW (1ULL << 0) +#define QUIC_EV_CONN_INIT (1ULL << 1) +#define QUIC_EV_CONN_ISEC (1ULL << 2) +#define QUIC_EV_CONN_RSEC (1ULL << 3) +#define QUIC_EV_CONN_WSEC (1ULL << 4) +#define QUIC_EV_CONN_RWSEC (1ULL << 5) +#define QUIC_EV_CONN_LPKT (1ULL << 6) +#define QUIC_EV_CONN_SPKT (1ULL << 7) +#define QUIC_EV_CONN_ENCPKT (1ULL << 8) +#define QUIC_EV_CONN_TXPKT (1ULL << 9) +#define QUIC_EV_CONN_PAPKT (1ULL << 10) +#define QUIC_EV_CONN_PAPKTS (1ULL << 11) +#define QUIC_EV_CONN_IO_CB (1ULL << 12) +#define QUIC_EV_CONN_RMHP (1ULL << 13) +#define QUIC_EV_CONN_PRSHPKT (1ULL << 14) +#define QUIC_EV_CONN_PRSAPKT (1ULL << 15) +#define QUIC_EV_CONN_PRSFRM (1ULL << 16) +#define QUIC_EV_CONN_PRSAFRM (1ULL << 17) +#define QUIC_EV_CONN_BFRM (1ULL << 18) +#define QUIC_EV_CONN_PHPKTS (1ULL << 19) +#define QUIC_EV_CONN_TRMHP (1ULL << 20) +#define QUIC_EV_CONN_ELRMHP (1ULL << 21) +#define QUIC_EV_CONN_RXPKT (1ULL << 22) +#define QUIC_EV_CONN_SSLDATA (1ULL << 23) +#define QUIC_EV_CONN_RXCDATA (1ULL << 24) +#define QUIC_EV_CONN_ADDDATA (1ULL << 25) +#define QUIC_EV_CONN_FFLIGHT (1ULL << 26) +#define QUIC_EV_CONN_SSLALERT (1ULL << 27) +#define QUIC_EV_CONN_PSTRM (1ULL << 28) +#define QUIC_EV_CONN_RTTUPDT (1ULL << 29) +#define QUIC_EV_CONN_CC (1ULL << 30) +#define QUIC_EV_CONN_SPPKTS (1ULL << 31) +#define QUIC_EV_CONN_PKTLOSS (1ULL << 32) +#define QUIC_EV_CONN_STIMER (1ULL << 33) +#define QUIC_EV_CONN_PTIMER (1ULL << 34) +#define QUIC_EV_CONN_SPTO (1ULL << 35) +#define QUIC_EV_CONN_BCFRMS (1ULL << 36) +#define QUIC_EV_CONN_XPRTSEND (1ULL << 37) +#define QUIC_EV_CONN_XPRTRECV (1ULL << 38) +#define QUIC_EV_CONN_FREED (1ULL << 39) +#define QUIC_EV_CONN_CLOSE (1ULL << 40) +#define QUIC_EV_CONN_ACKSTRM (1ULL << 41) +#define QUIC_EV_CONN_FRMLIST (1ULL << 42) +#define QUIC_EV_STATELESS_RST (1ULL << 43) +#define QUIC_EV_TRANSP_PARAMS (1ULL << 44) +#define QUIC_EV_CONN_IDLE_TIMER (1ULL << 45) +#define QUIC_EV_CONN_SUB (1ULL << 46) +#define QUIC_EV_CONN_ELEVELSEL (1ULL << 47) +#define QUIC_EV_CONN_RCV (1ULL << 48) +#define QUIC_EV_CONN_KILL (1ULL << 49) +#define QUIC_EV_CONN_KP (1ULL << 50) +#define QUIC_EV_CONN_SSL_COMPAT (1ULL << 51) +#define QUIC_EV_CONN_SET_AFFINITY (1ULL << 52) + +#endif /* _HAPROXY_QUIC_TRACE_T_H */ diff --git a/include/haproxy/quic_trace.h b/include/haproxy/quic_trace.h new file mode 100644 index 0000000..19fe864 --- /dev/null +++ b/include/haproxy/quic_trace.h @@ -0,0 +1,40 @@ +/* + * include/haproxy/quic_trace.h + * This file contains QUIC traces definitions. 
+ *
+ * Copyright (C) 2023
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef _HAPROXY_QUIC_TRACE_H
+#define _HAPROXY_QUIC_TRACE_H
+
+#include <haproxy/quic_trace-t.h>
+
+#define TRACE_SOURCE &trace_quic
+
+/* Initializes an enc_debug_info struct (only for debug purposes) */
+static inline void enc_debug_info_init(struct enc_debug_info *edi,
+                                       unsigned char *payload, size_t payload_len,
+                                       unsigned char *aad, size_t aad_len, uint64_t pn)
+{
+	edi->payload = payload;
+	edi->payload_len = payload_len;
+	edi->aad = aad;
+	edi->aad_len = aad_len;
+	edi->pn = pn;
+}
+
+#endif /* _HAPROXY_QUIC_TRACE_H */
diff --git a/include/haproxy/quic_tx-t.h b/include/haproxy/quic_tx-t.h
new file mode 100644
index 0000000..4653f04
--- /dev/null
+++ b/include/haproxy/quic_tx-t.h
@@ -0,0 +1,56 @@
+#ifndef _HAPROXY_TX_T_H
+#define _HAPROXY_TX_T_H
+
+#define QUIC_MIN_CC_PKTSIZE  128
+#define QUIC_DGRAM_HEADLEN  (sizeof(uint16_t) + sizeof(void *))
+#define QUIC_MAX_CC_BUFSIZE (2 * (QUIC_MIN_CC_PKTSIZE + QUIC_DGRAM_HEADLEN))
+
+extern struct pool_head *pool_head_quic_tx_packet;
+extern struct pool_head *pool_head_quic_cc_buf;
+
+/* Flag a sent packet as being an ack-eliciting packet. */
+#define QUIC_FL_TX_PACKET_ACK_ELICITING (1UL << 0)
+/* Flag a sent packet as containing a PADDING frame. */
+#define QUIC_FL_TX_PACKET_PADDING       (1UL << 1)
+/* Flag a sent packet as being in flight. */
+#define QUIC_FL_TX_PACKET_IN_FLIGHT     (QUIC_FL_TX_PACKET_ACK_ELICITING | QUIC_FL_TX_PACKET_PADDING)
+/* Flag a sent packet as containing a CONNECTION_CLOSE frame */
+#define QUIC_FL_TX_PACKET_CC            (1UL << 2)
+/* Flag a sent packet as containing an ACK frame */
+#define QUIC_FL_TX_PACKET_ACK           (1UL << 3)
+/* Flag a sent packet as being coalesced to another one in the same datagram */
+#define QUIC_FL_TX_PACKET_COALESCED     (1UL << 4)
+/* Flag a sent packet as being probing with old data */
+#define QUIC_FL_TX_PACKET_PROBE_WITH_OLD_DATA (1UL << 5)
+
+/* Structure to store enough information about TX QUIC packets. */
+struct quic_tx_packet {
+	/* List entry point. */
+	struct list list;
+	/* Packet length */
+	size_t len;
+	/* This is not the packet length but the length of outstanding data
+	 * for in flight TX packet.
+	 */
+	size_t in_flight_len;
+	struct eb64_node pn_node;
+	/* The list of frames of this packet. */
+	struct list frms;
+	/* The time this packet was sent (ms). */
+	unsigned int time_sent;
+	/* Packet number space. */
+	struct quic_pktns *pktns;
+	/* Flags. */
+	unsigned int flags;
+	/* Reference counter */
+	int refcnt;
+	/* Next packet in the same datagram */
+	struct quic_tx_packet *next;
+	/* Previous packet in the same datagram */
+	struct quic_tx_packet *prev;
+	/* Largest acknowledged packet number if this packet contains an ACK frame */
+	int64_t largest_acked_pn;
+	unsigned char type;
+};
+
+#endif /* _HAPROXY_TX_T_H */
diff --git a/include/haproxy/quic_tx.h b/include/haproxy/quic_tx.h
new file mode 100644
index 0000000..0659c14
--- /dev/null
+++ b/include/haproxy/quic_tx.h
@@ -0,0 +1,92 @@
+/*
+ * QUIC protocol definitions (TX side).
+ *
+ * Copyright (C) 2023
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_QUIC_TX_H
+#define _HAPROXY_QUIC_TX_H
+
+#include <haproxy/buf-t.h>
+#include <haproxy/list-t.h>
+#include <haproxy/quic_conn-t.h>
+#include <haproxy/quic_tls-t.h>
+#include <haproxy/quic_rx-t.h>
+#include <haproxy/quic_tx-t.h>
+
+struct buffer *qc_txb_alloc(struct quic_conn *qc);
+void qc_txb_release(struct quic_conn *qc);
+int qc_purge_txbuf(struct quic_conn *qc, struct buffer *buf);
+struct buffer *qc_get_txb(struct quic_conn *qc);
+
+int qc_prep_hpkts(struct quic_conn *qc, struct buffer *buf, struct list *qels);
+int qc_send_ppkts(struct buffer *buf, struct ssl_sock_ctx *ctx);
+int qc_send_app_pkts(struct quic_conn *qc, struct list *frms);
+int qc_dgrams_retransmit(struct quic_conn *qc);
+void qc_prep_hdshk_fast_retrans(struct quic_conn *qc,
+                                struct list *ifrms, struct list *hfrms);
+int send_retry(int fd, struct sockaddr_storage *addr,
+               struct quic_rx_packet *pkt, const struct quic_version *qv);
+int send_stateless_reset(struct listener *l, struct sockaddr_storage *dstaddr,
+                         struct quic_rx_packet *rxpkt);
+int send_version_negotiation(int fd, struct sockaddr_storage *addr,
+                             struct quic_rx_packet *pkt);
+
+/* The TX packets sent in the same datagram are linked to each other in
+ * the order they are built. This function detaches a packet from its
+ * successor and predecessor in the same datagram.
+ */
+static inline void quic_tx_packet_dgram_detach(struct quic_tx_packet *pkt)
+{
+	if (pkt->prev)
+		pkt->prev->next = pkt->next;
+	if (pkt->next)
+		pkt->next->prev = pkt->prev;
+}
+
+
+/* Increment the reference counter of <pkt> */
+static inline void quic_tx_packet_refinc(struct quic_tx_packet *pkt)
+{
+	pkt->refcnt++;
+}
+
+/* Decrement the reference counter of <pkt> */
+static inline void quic_tx_packet_refdec(struct quic_tx_packet *pkt)
+{
+	if (--pkt->refcnt == 0) {
+		BUG_ON(!LIST_ISEMPTY(&pkt->frms));
+		/* If there are other packets in the same datagram <pkt> is
+		 * attached to, detach the previous one and the next one from
+		 * <pkt>.
+		 */
+		quic_tx_packet_dgram_detach(pkt);
+		pool_free(pool_head_quic_tx_packet, pkt);
+	}
+}
+
+/* Return the number of bytes which may be sent from <qc> connection when
+ * it has not already been validated.
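+ *
+ * For example (figures are illustrative): after receiving a 1200-byte
+ * Initial datagram from a not yet validated address and preparing 800
+ * bytes for transmission, up to 3 * 1200 - 800 = 2800 more bytes may be
+ * sent, per the 3x anti-amplification limit of RFC 9000.
+ *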
+ * Note that it is the caller's responsibility to first check this case
+ * with quic_peer_validated_addr(), which BUG_ON()s when
+ * 3 * qc->bytes.rx < qc->bytes.prep.
+ */
+static inline size_t quic_may_send_bytes(struct quic_conn *qc)
+{
+	return 3 * qc->bytes.rx - qc->bytes.prep;
+}
+
+
+#endif /* _HAPROXY_QUIC_TX_H */
diff --git a/include/haproxy/receiver-t.h b/include/haproxy/receiver-t.h
new file mode 100644
index 0000000..0ae441e
--- /dev/null
+++ b/include/haproxy/receiver-t.h
@@ -0,0 +1,106 @@
+/*
+ * include/haproxy/receiver-t.h
+ * This file defines the structures needed to manage receivers.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_RECEIVER_T_H
+#define _HAPROXY_RECEIVER_T_H
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include <haproxy/api-t.h>
+#include <haproxy/namespace-t.h>
+#include <haproxy/proto_rhttp-t.h>
+#include <haproxy/quic_sock-t.h>
+#include <haproxy/thread.h>
+
+/* Bit values for receiver->flags */
+#define RX_F_BOUND              0x00000001 /* receiver already bound */
+#define RX_F_INHERITED          0x00000002 /* inherited FD from the parent process (fd@) or duped from another local receiver */
+#define RX_F_MWORKER            0x00000004 /* keep the FD open in the master but close it in the children */
+#define RX_F_MUST_DUP           0x00000008 /* this receiver's fd must be dup() from a reference; ignore socket-level ops here */
+#define RX_F_NON_SUSPENDABLE    0x00000010 /* this socket cannot be suspended hence must always be unbound */
+
+/* Bit values for rx_settings->options */
+#define RX_O_FOREIGN            0x00000001 /* receives on foreign addresses */
+#define RX_O_V4V6               0x00000002 /* binds to both IPv4 and IPv6 addresses if !V6ONLY */
+#define RX_O_V6ONLY             0x00000004 /* binds to IPv6 addresses only */
+
+/* All the settings that are used to configure a receiver */
+struct rx_settings {
+	struct {                         /* UNIX socket permissions */
+		uid_t uid;               /* -1 to leave unchanged */
+		gid_t gid;               /* -1 to leave unchanged */
+		mode_t mode;             /* 0 to leave unchanged */
+	} ux;
+	char *interface;                 /* interface name or NULL */
+	const struct netns_entry *netns; /* network namespace of the listener */
+	unsigned int options;            /* receiver options (RX_O_*) */
+	int shards;                      /* number of shards, 0=not set yet, -1="by-thread" */
+};
+
+/* info about a shard that is shared between multiple groups. Receivers that
+ * are alone in their shard do not have a shard_info.
+ */
+struct shard_info {
+	uint nbgroups;                   /* number of groups in this shard (=#rx); Zero = unused.
*/ + uint nbthreads; /* number of threads in this shard (>=nbgroups) */ + ulong tgroup_mask; /* bitmask of thread groups having a member here */ + struct receiver *ref; /* first one, reference for FDs to duplicate */ + struct receiver *members[MAX_TGROUPS]; /* all members of the shard (one per thread group) */ +}; + +/* This describes a receiver with all its characteristics (address, options, etc) */ +struct receiver { + int fd; /* handle we receive from (fd only for now) */ + unsigned int flags; /* receiver options (RX_F_*) */ + struct protocol *proto; /* protocol this receiver belongs to */ + void *owner; /* receiver's owner (usually a listener) */ + void (*iocb)(int fd); /* generic I/O handler (typically accept callback) */ + unsigned long bind_thread; /* bitmask of threads allowed on this receiver */ + uint bind_tgroup; /* thread group ID: 0=global IDs, non-zero=local IDs */ + struct rx_settings *settings; /* points to the settings used by this receiver */ + struct shard_info *shard_info; /* points to info about the owning shard, NULL if single rx */ + struct list proto_list; /* list in the protocol header */ +#ifdef USE_QUIC + struct mt_list rxbuf_list; /* list of buffers to receive and dispatch QUIC datagrams. */ + enum quic_sock_mode quic_mode; /* QUIC socket allocation strategy */ + unsigned int quic_curr_handshake; /* count of active QUIC handshakes */ + unsigned int quic_curr_accept; /* count of QUIC conns waiting for accept */ +#endif + struct { + struct task *task; /* Task used to open connection for reverse. */ + struct server *srv; /* Underlying server used to initiate reverse pre-connect. */ + struct connection *pend_conn; /* Pending connection waiting to complete reversal before being accepted. */ + enum li_preconn_state state; /* State for transition logging. */ + } rhttp; + + /* warning: this struct is huge, keep it at the bottom */ + struct sockaddr_storage addr; /* the address the socket is bound to */ +}; + +#endif /* _HAPROXY_RECEIVER_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/regex-t.h b/include/haproxy/regex-t.h new file mode 100644 index 0000000..33d88a2 --- /dev/null +++ b/include/haproxy/regex-t.h @@ -0,0 +1,78 @@ +/* + * include/haproxy/regex-t.h + * Types and macros definitions for regular expressions + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_REGEX_T_H
+#define _HAPROXY_REGEX_T_H
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <haproxy/api.h>
+
+#ifdef USE_PCRE
+#include <pcre.h>
+#include <pcreposix.h>
+
+/* For pre-8.20 PCRE compatibility */
+#ifndef PCRE_STUDY_JIT_COMPILE
+#define PCRE_STUDY_JIT_COMPILE 0
+#endif
+
+#elif defined(USE_PCRE2)
+#include <pcre2.h>
+#include <pcre2posix.h>
+
+#else /* no PCRE, nor PCRE2 */
+#include <regex.h>
+#endif
+
+struct my_regex {
+#ifdef USE_PCRE
+	pcre *reg;
+	pcre_extra *extra;
+#ifdef USE_PCRE_JIT
+#ifndef PCRE_CONFIG_JIT
+#error "The PCRE lib doesn't support JIT. Change your lib, or remove the option USE_PCRE_JIT."
+#endif
+#endif
+#elif defined(USE_PCRE2)
+	int(*mfn)(const pcre2_code *, PCRE2_SPTR, PCRE2_SIZE, PCRE2_SIZE, uint32_t, pcre2_match_data *, pcre2_match_context *);
+	pcre2_code *reg;
+#else /* no PCRE */
+	regex_t regex;
+#endif
+};
+
+struct hdr_exp {
+	struct hdr_exp *next;
+	struct my_regex *preg;  /* expression to look for */
+	const char *replace;    /* expression to set instead */
+	void *cond;             /* a possible condition or NULL */
+};
+
+#endif /* _HAPROXY_REGEX_T_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/regex.h b/include/haproxy/regex.h
new file mode 100644
index 0000000..2cd9573
--- /dev/null
+++ b/include/haproxy/regex.h
@@ -0,0 +1,144 @@
+/*
+ * include/haproxy/regex.h
+ * Compatibility layer for various regular expression engines
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_REGEX_H
+#define _HAPROXY_REGEX_H
+
+#include <stdlib.h>
+#include <string.h>
+
+#include <haproxy/api.h>
+#include <haproxy/regex-t.h>
+
+extern THREAD_LOCAL regmatch_t pmatch[MAX_MATCH];
+
+/* "str" is the string that contains the regex to compile.
+ * "regex" is preallocated memory. After the execution of this function, this
+ * struct contains the compiled regex.
+ * "cs" is the case-sensitive flag. If cs is true, case-sensitive matching
+ * is enabled.
+ * "cap" is the capture flag. If cap is true, the regex can capture strings
+ * into parentheses.
+ * "err" is the standard error message pointer.
+ *
+ * The function returns 1 on success, otherwise it returns 0 and err is filled.
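+ *
+ * Minimal usage sketch (hypothetical error handling and input, for
+ * illustration only; <err> is assumed to carry an allocated message on
+ * failure):
+ *
+ *   char *err = NULL;
+ *   struct my_regex *re = regex_comp("^foo[0-9]+$", 1, 0, &err);
+ *
+ *   if (!re) {
+ *       fprintf(stderr, "regex error: %s\n", err);
+ *       free(err);
+ *       return 0;
+ *   }
+ *   if (regex_exec(re, some_string))
+ *       ... matched ...
+ *   regex_free(re);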
+ */ +struct my_regex *regex_comp(const char *str, int cs, int cap, char **err); +int exp_replace(char *dst, unsigned int dst_size, char *src, const char *str, const regmatch_t *matches); +const char *check_replace_string(const char *str); +int regex_exec_match(const struct my_regex *preg, const char *subject, + size_t nmatch, regmatch_t pmatch[], int flags); +int regex_exec_match2(const struct my_regex *preg, char *subject, int length, + size_t nmatch, regmatch_t pmatch[], int flags); + + +/* If the function doesn't match, it returns false, else it returns true. + */ +static inline int regex_exec(const struct my_regex *preg, char *subject) +{ +#if defined(USE_PCRE) || defined(USE_PCRE_JIT) + if (pcre_exec(preg->reg, preg->extra, subject, strlen(subject), 0, 0, NULL, 0) < 0) + return 0; + return 1; +#elif defined(USE_PCRE2) + pcre2_match_data *pm; + int ret; + + pm = pcre2_match_data_create_from_pattern(preg->reg, NULL); + ret = preg->mfn(preg->reg, (PCRE2_SPTR)subject, (PCRE2_SIZE)strlen(subject), + 0, 0, pm, NULL); + pcre2_match_data_free(pm); + if (ret < 0) + return 0; + return 1; +#else + int match; + match = regexec(&preg->regex, subject, 0, NULL, 0); + if (match == REG_NOMATCH) + return 0; + return 1; +#endif +} + +/* Note that <subject> MUST be at least <length+1> characters long and must + * be writable because the function will temporarily force a zero past the + * last character. + * + * If the function doesn't match, it returns false, else it returns true. + */ +static inline int regex_exec2(const struct my_regex *preg, char *subject, int length) +{ +#if defined(USE_PCRE) || defined(USE_PCRE_JIT) + if (pcre_exec(preg->reg, preg->extra, subject, length, 0, 0, NULL, 0) < 0) + return 0; + return 1; +#elif defined(USE_PCRE2) + pcre2_match_data *pm; + int ret; + + pm = pcre2_match_data_create_from_pattern(preg->reg, NULL); + ret = preg->mfn(preg->reg, (PCRE2_SPTR)subject, (PCRE2_SIZE)length, + 0, 0, pm, NULL); + pcre2_match_data_free(pm); + if (ret < 0) + return 0; + return 1; +#else + int match; + char old_char = subject[length]; + subject[length] = 0; + match = regexec(&preg->regex, subject, 0, NULL, 0); + subject[length] = old_char; + if (match == REG_NOMATCH) + return 0; + return 1; +#endif +} + +static inline void regex_free(struct my_regex *preg) +{ + if (!preg) + return; +#if defined(USE_PCRE) || defined(USE_PCRE_JIT) + pcre_free(preg->reg); +/* PCRE < 8.20 requires pcre_free() while >= 8.20 requires pcre_study_free(), + * which is easily detected using PCRE_CONFIG_JIT. + */ +#ifdef PCRE_CONFIG_JIT + pcre_free_study(preg->extra); +#else /* PCRE_CONFIG_JIT */ + pcre_free(preg->extra); +#endif /* PCRE_CONFIG_JIT */ +#elif defined(USE_PCRE2) || defined(USE_PCRE2_JIT) + pcre2_code_free(preg->reg); +#else + regfree(&preg->regex); +#endif + free(preg); +} + +#endif /* _HAPROXY_REGEX_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/resolvers-t.h b/include/haproxy/resolvers-t.h new file mode 100644 index 0000000..b727463 --- /dev/null +++ b/include/haproxy/resolvers-t.h @@ -0,0 +1,297 @@ +/* + * include/haproxy/dns-t.h + * This file provides structures and types for DNS. + * + * Copyright (C) 2014 Baptiste Assmann <bedis9@gmail.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_RESOLVERS_T_H +#define _HAPROXY_RESOLVERS_T_H + +#include <import/ebtree-t.h> + +#include <haproxy/connection-t.h> +#include <haproxy/dns-t.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/stats-t.h> +#include <haproxy/task-t.h> +#include <haproxy/thread.h> + +extern struct pool_head *resolv_requester_pool; + +/*DNS maximum values */ +/* + * Maximum issued from RFC: + * RFC 1035: https://www.ietf.org/rfc/rfc1035.txt chapter 2.3.4 + * RFC 2671: http://tools.ietf.org/html/rfc2671 + */ +#define DNS_MAX_LABEL_SIZE 63 +#define DNS_MAX_NAME_SIZE 255 +#define DNS_MAX_UDP_MESSAGE 65535 + +/* DNS minimum record size: 1 char + 1 NULL + type + class */ +#define DNS_MIN_RECORD_SIZE (1 + 1 + 2 + 2) + +/* DNS smallest fqdn 'a.gl' size */ +# define DNS_SMALLEST_FQDN_SIZE 4 + +/* maximum number of query records in a DNS response + * For now, we allow only one */ +#define DNS_MAX_QUERY_RECORDS 1 + +/* maximum number of answer record in a DNS response */ +#define DNS_MAX_ANSWER_RECORDS ((DNS_MAX_UDP_MESSAGE - DNS_HEADER_SIZE) / DNS_MIN_RECORD_SIZE) + +/* size of dns_buffer used to store responses from the buffer + * dns_buffer is used to store data collected from records found in a response. + * Before using it, caller will always check that there is at least DNS_MAX_NAME_SIZE bytes + * available */ +#define DNS_ANALYZE_BUFFER_SIZE DNS_MAX_UDP_MESSAGE + DNS_MAX_NAME_SIZE + +/* DNS error messages */ +#define DNS_TOO_LONG_FQDN "hostname too long" +#define DNS_LABEL_TOO_LONG "one label too long" +#define DNS_INVALID_CHARACTER "found an invalid character" + +/* dns query class */ +#define DNS_RCLASS_IN 1 /* internet class */ + +/* dns record types (non exhaustive list) */ +#define DNS_RTYPE_A 1 /* IPv4 address */ +#define DNS_RTYPE_CNAME 5 /* canonical name */ +#define DNS_RTYPE_AAAA 28 /* IPv6 address */ +#define DNS_RTYPE_SRV 33 /* SRV record */ +#define DNS_RTYPE_OPT 41 /* OPT */ +#define DNS_RTYPE_ANY 255 /* all records */ + +/* dns rcode values */ +#define DNS_RCODE_NO_ERROR 0 /* no error */ +#define DNS_RCODE_NX_DOMAIN 3 /* non existent domain */ +#define DNS_RCODE_REFUSED 5 /* query refused */ + +/* dns flags masks */ +#define DNS_FLAG_TRUNCATED 0x0200 /* mask for truncated flag */ +#define DNS_FLAG_REPLYCODE 0x000F /* mask for reply code */ + +/* max number of network preference entries are available from the + * configuration file. 
+ */
+#define SRV_MAX_PREF_NET 5
+
+/* NOTE: big endian structure */
+struct resolv_query_item {
+	char           name[DNS_MAX_NAME_SIZE+1]; /* query name */
+	unsigned short type;                      /* question type */
+	unsigned short class;                     /* query class */
+};
+
+/* NOTE: big endian structure */
+struct resolv_answer_item {
+	/* For SRV type, name also includes service and protocol value */
+	char            name[DNS_MAX_NAME_SIZE+1];   /* answer name */
+	int16_t         type;                        /* question type */
+	int16_t         class;                       /* query class */
+	int32_t         ttl;                         /* response TTL */
+	int16_t         priority;                    /* SRV type priority */
+	uint16_t        weight;                      /* SRV type weight */
+	uint16_t        port;                        /* SRV type port */
+	uint16_t        data_len;                    /* number of bytes in the <data> field below */
+	struct eb32_node link;                       /* linking node */
+	union {
+		struct sockaddr_in in4;              /* IPv4 address for RTYPE_A */
+		struct sockaddr_in6 in6;             /* IPv6 address for RTYPE_AAAA */
+		char target[DNS_MAX_NAME_SIZE+1];    /* Response data: SRV or CNAME type target */
+	} data;
+	unsigned int    last_seen;                   /* when the answer was last seen */
+	struct resolv_answer_item *ar_item;          /* pointer to an RRset from the additional section, if it exists */
+	struct list     attached_servers;            /* attached server head */
+};
+
+struct resolv_response {
+	struct dns_header header;
+	struct eb_root answer_tree;
+	/* authority ignored for now */
+};
+
+/* Resolvers section and parameters. It is linked to the name servers that
+ * point to it. Current resolutions are stored in a FIFO list.
+ */
+struct resolvers {
+	__decl_thread(HA_SPINLOCK_T lock);
+	unsigned int accepted_payload_size; /* maximum payload size we accept for responses */
+	int          nb_nameservers;        /* total number of active nameservers in a resolvers section */
+	int          resolve_retries;       /* number of retries before giving up */
+	struct {                            /* time to: */
+		int resolve;                /*   wait between 2 queries for the same resolution */
+		int retry;                  /*   wait for a response before retrying */
+	} timeout;
+	struct {                            /* time to hold current data when */
+		int valid;                  /*   a response is valid */
+		int nx;                     /*   a response doesn't exist */
+		int timeout;                /*   no answer was delivered */
+		int refused;                /*   dns server refused to answer */
+		int other;                  /*   other dns response errors */
+		int obsolete;               /*   an answer hasn't been seen */
+	} hold;
+	struct task *t;                     /* timeout management */
+	struct {
+		struct list wait;           /* resolutions waiting in this resolvers section */
+		struct list curr;           /* currently running resolutions */
+	} resolutions;
+	struct eb_root query_ids;           /* tree to quickly lookup/retrieve query ids currently in use
+	                                     * used by each nameserver, but stored in resolvers since there must
+	                                     * be a unique relation between an eb_root and an eb_node (resolution) */
+	struct list  list;                  /* resolvers list */
+	struct list  nameservers;           /* dns server list */
+	struct proxy *px;                   /* px to handle connections to DNS servers */
+	char *id;                           /* resolvers unique identifier */
+	struct {
+		const char *file;           /* file where the section appears */
+		int         line;           /* line where the section appears */
+		int         implicit;       /* config was auto-generated and must be silent */
+	} conf;                             /* config information */
+};
+
+struct resolv_options {
+	int family_prio; /* which IP family should the resolver use when both are returned */
+	struct {
+		int family;
+		union {
+			struct in_addr  in4;
+			struct in6_addr in6;
+		} addr;
+		union {
+			struct in_addr  in4;
+			struct in6_addr in6;
+		} mask;
+	} pref_net[SRV_MAX_PREF_NET];
+	int pref_net_nb; /* The number of registered preferred networks. */
+	int accept_duplicate_ip; /* flag to indicate whether the associated object can use an IP address
+	                          * already set to another object of the same group */
+	int ignore_weight;       /* flag to indicate whether to ignore the weight within the record */
+};
+
+/* Resolution structure associated with a single server and used to manage
+ * name resolution for this server.
+ * The only link between the resolution and a nameserver is through the
+ * query_id.
+ */
+struct resolv_resolution {
+	struct resolvers     *resolvers;           /* pointer to the resolvers structure owning the resolution */
+	struct list           requesters;          /* list of requesters using this resolution */
+	int                   uuid;                /* unique id (used for debugging purposes) */
+	char                 *hostname_dn;         /* server hostname in domain name label format */
+	int                   hostname_dn_len;     /* server domain name label len */
+	unsigned int          last_resolution;     /* time of the last resolution */
+	unsigned int          last_query;          /* time of the last query sent */
+	unsigned int          last_valid;          /* time of the last valid response */
+	int                   query_id;            /* DNS query ID dedicated for this resolution */
+	struct eb32_node      qid;                 /* ebtree query id */
+	int                   prefered_query_type; /* preferred query type */
+	int                   query_type;          /* current query type */
+	int                   status;              /* status of the resolution being processed RSLV_STATUS_* */
+	int                   step;                /* RSLV_STEP_* */
+	int                   try;                 /* current resolution try */
+	int                   nb_queries;          /* count number of queries sent */
+	int                   nb_responses;        /* count number of responses received */
+
+	struct resolv_response response;           /* structure hosting the DNS response */
+	struct resolv_query_item response_query_records[DNS_MAX_QUERY_RECORDS]; /* <response> query records */
+
+	struct list list;                          /* resolution list */
+};
+
+/* Structure used to describe the owner of a DNS resolution. */
+struct resolv_requester {
+	enum obj_type             *owner;      /* pointer to the owner (server or dns_srvrq) */
+	struct resolv_resolution  *resolution; /* pointer to the owned DNS resolution */
+
+	int (*requester_cb)(struct resolv_requester *, struct dns_counters *); /* requester callback for valid response */
+	int (*requester_error_cb)(struct resolv_requester *, int);             /* requester callback, for error management */
+
+	struct list list;                      /* requester list */
+};
+
+/* Last resolution status code */
+enum {
+	RSLV_STATUS_NONE = 0,  /* no resolution occurred yet */
+	RSLV_STATUS_VALID,     /* no error */
+	RSLV_STATUS_INVALID,   /* invalid responses */
+	RSLV_STATUS_ERROR,     /* error */
+	RSLV_STATUS_NX,        /* NXDOMAIN */
+	RSLV_STATUS_REFUSED,   /* server refused our query */
+	RSLV_STATUS_TIMEOUT,   /* no response from DNS servers */
+	RSLV_STATUS_OTHER,     /* other errors */
+};
+
+/* Current resolution step */
+enum {
+	RSLV_STEP_NONE = 0,    /* nothing happening currently */
+	RSLV_STEP_RUNNING,     /* resolution is running */
+};
+
+/* Return codes after analyzing a DNS response */
+enum {
+	RSLV_RESP_VALID = 0,          /* valid response */
+	RSLV_RESP_INVALID,            /* invalid response (various types of errors can trigger it) */
+	RSLV_RESP_ERROR,              /* DNS error code */
+	RSLV_RESP_NX_DOMAIN,          /* resolution unsuccessful */
+	RSLV_RESP_REFUSED,            /* DNS server refused to answer */
+	RSLV_RESP_ANCOUNT_ZERO,       /* no answers in the response */
+	RSLV_RESP_WRONG_NAME,         /* response does not match query name */
+	RSLV_RESP_CNAME_ERROR,        /* error when resolving a CNAME in an atomic response */
+	RSLV_RESP_TIMEOUT,            /* DNS server has not answered in time */
+	RSLV_RESP_TRUNCATED,          /* DNS response is truncated */
+	RSLV_RESP_NO_EXPECTED_RECORD, /* No expected records were found in the response */
+	RSLV_RESP_QUERY_COUNT_ERROR,  /* we did not get the expected number of queries in the response */
+	RSLV_RESP_INTERNAL,           /* internal resolver error */
+};
+
+/* Return codes after searching an IP in a DNS response buffer, using a family
+ * preference
+ */
+enum {
+	RSLV_UPD_NO = 1,          /* provided IP was found and preference is matched
+	                           * OR provided IP found and preference is not matched, but no IP
+	                           * matching preference was found. */
+	RSLV_UPD_SRVIP_NOT_FOUND, /* provided IP not found
+	                           * OR provided IP found and preference is not matched and an IP
+	                           * matching preference was found.
+ */ + RSLV_UPD_CNAME, /* CNAME without any IP provided in the response */ + RSLV_UPD_NAME_ERROR, /* name in the response did not match the query */ + RSLV_UPD_NO_IP_FOUND, /* no IP could be found in the response */ + RSLV_UPD_OBSOLETE_IP, /* The server IP was obsolete, and no other IP was found */ +}; + +struct proxy; +struct resolv_srvrq { + enum obj_type obj_type; /* object type == OBJ_TYPE_SRVRQ */ + struct resolvers *resolvers; /* pointer to the resolvers structure used for this server template */ + struct proxy *proxy; /* associated proxy */ + char *name; + char *hostname_dn; /* server hostname in Domain Name format */ + int hostname_dn_len; /* string length of the server hostname in Domain Name format */ + struct resolv_requester *requester; /* used to link to its DNS resolution */ + struct list attached_servers; /* List of the servers free to use */ + struct eb_root named_servers; /* tree of servers indexed by hostnames found in server state file */ + struct list list; /* Next SRV RQ for the same proxy */ +}; + +#endif /* _HAPROXY_RESOLVERS_T_H */ diff --git a/include/haproxy/resolvers.h b/include/haproxy/resolvers.h new file mode 100644 index 0000000..5d4c744 --- /dev/null +++ b/include/haproxy/resolvers.h @@ -0,0 +1,66 @@ +/* + * include/haproxy/dns.h + * This file provides functions related to DNS protocol + * + * Copyright (C) 2014 Baptiste Assmann <bedis9@gmail.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_RESOLVERS_H +#define _HAPROXY_RESOLVERS_H + +#include <haproxy/resolvers-t.h> + +struct proxy; +struct server; +struct stconn; +struct act_rule; +struct list; + +extern struct list sec_resolvers; +extern unsigned int resolv_failed_resolutions; + +struct resolvers *find_resolvers_by_id(const char *id); +struct resolv_srvrq *find_srvrq_by_name(const char *name, struct proxy *px); +struct resolv_srvrq *new_resolv_srvrq(struct server *srv, char *fqdn); +struct resolv_answer_item *find_srvrq_answer_record(const struct resolv_requester *requester); + +int resolv_str_to_dn_label(const char *str, int str_len, char *dn, int dn_len); +int resolv_dn_label_to_str(const char *dn, int dn_len, char *str, int str_len); + +int resolv_hostname_validation(const char *string, char **err); +int resolv_get_ip_from_response(struct resolv_response *r_res, + struct resolv_options *resolv_opts, void *currentip, + short currentip_sin_family, + void **newip, short *newip_sin_family, + struct server *owner); + +int resolv_link_resolution(void *requester, int requester_type, int requester_locked); +void resolv_unlink_resolution(struct resolv_requester *requester); +void resolv_detach_from_resolution_answer_items(struct resolv_resolution *res, struct resolv_requester *req); +void resolv_trigger_resolution(struct resolv_requester *requester); +enum act_parse_ret resolv_parse_do_resolve(const char **args, int *orig_arg, struct proxy *px, struct act_rule *rule, char **err); +int check_action_do_resolve(struct act_rule *rule, struct proxy *px, char **err); + +int stats_dump_resolvers(struct stconn *sc, + struct field *stats, size_t stats_count, + struct list *stat_modules); +void resolv_stats_clear_counters(int clrall, struct list *stat_modules); +int resolv_allocate_counters(struct list *stat_modules); +int dns_dgram_init(struct dns_nameserver *ns, struct sockaddr_storage *sk); +int resolvers_create_default(); + +#endif // _HAPROXY_RESOLVER_H diff --git a/include/haproxy/ring-t.h b/include/haproxy/ring-t.h new file mode 100644 index 0000000..b89c886 --- /dev/null +++ b/include/haproxy/ring-t.h @@ -0,0 +1,113 @@ +/* + * include/haproxy/ring-t.h + * This file provides definitions for ring buffers used for disposable data. + * + * Copyright (C) 2000-2019 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_RING_T_H +#define _HAPROXY_RING_T_H + +#include <haproxy/api-t.h> +#include <haproxy/buf-t.h> +#include <haproxy/thread.h> + +/* The code below handles circular buffers with single-producer and multiple + * readers (up to 255). The buffer storage area must remain always allocated. 
+ * It's made of a series of payload blocks followed by a readers count (RC).
+ * There is always a readers count at the beginning of the buffer as well. Each
+ * payload block is composed of a varint-encoded size (VI) followed by the
+ * actual payload (PL).
+ *
+ * The readers count is encoded on a single byte. It indicates how many readers
+ * are still waiting at this position. The writer writes after the buffer's
+ * tail, which initially starts just past the first readers count. Then it
+ * knows by reading this count that it must wake up the readers to indicate
+ * data availability. When a reader reads the payload block, it increments the
+ * next readers count and decrements the current one. The area between the
+ * initial readers count and the next one is protected from overwriting for as
+ * long as the initial count is non-null. As such, these readers counts are
+ * effective barriers against data recycling.
+ *
+ * Only the writer is allowed to update the buffer's tail/head. This ensures
+ * that events can remain as long as possible so that late readers can get the
+ * maximum history available. It also helps when dealing with multi-thread
+ * accesses using a simple RW lock during the buffer head's manipulation. The
+ * writer will have to delete some old records starting at the head until the
+ * new message can fit or a non-null readers count is encountered. If a message
+ * cannot fit due to insufficient room, the message is lost and the drop
+ * counter must be incremented.
+ *
+ * Like any buffer, this buffer naturally wraps at the end and continues at the
+ * beginning. The creation process consists in immediately adding a null
+ * readers count byte into the buffer. The write process consists in always
+ * writing a payload block followed by a new readers count. The delete process
+ * consists in removing a null readers count and payload block. As such, there
+ * is always at least one readers count byte in the buffer available at the
+ * head for new readers to attach to, and one before the tail, both of which
+ * may be the same when the buffer doesn't contain any event. It is thus safe
+ * for any reader to simply keep the absolute offset of the last visited
+ * position and to restart from there. The writer will update the buffer's
+ * absolute offset when deleting entries. All this also has the benefit of
+ * allowing a buffer to be hot-resized without losing its contents.
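+ *
+ * As a sketch of the writer-side flow described above (simplified
+ * pseudo-code with invented helper names, not the actual implementation,
+ * which lives behind ring_write()):
+ *
+ *   lock_write(ring);
+ *   while (free_room(ring) < needed && readers_count_at(head) == 0)
+ *       delete_oldest_block(ring);        (advances head, updates offset)
+ *   if (free_room(ring) >= needed) {
+ *       write_varint_size_and_payload(ring, msg);
+ *       write_null_readers_count(ring);   (new attach point at the tail)
+ *   } else {
+ *       drops++;                          (message lost)
+ *   }
+ *   unlock_write(ring);
+ *   wake_up_readers(ring);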
+ * + * Thus we have this : + * - init of empty buffer: + * head-, ,-tail + * [ RC | xxxxxxxxxxxxxxxxxxxxxxxxxx ] + * + * - reader attached: + * head-, ,-tail + * [ RC | xxxxxxxxxxxxxxxxxxxxxxxxxx ] + * ^- +1 + * + * - append of one event: + * appended + * head-, <----------> ,-tail + * [ RC | VI | PL | RC | xxxxxxxxxxx ] + * + * - reader advancing: + * head-, ,-tail + * [ RC | VI | PL | RC | xxxxxxxxxxx ] + * ^- -1 ^- +1 + * + * - writer removing older message: + * head-, ,-tail + * [ xxxxxxxxxxxx | RC | xxxxxxxxxxx ] + * <----------> + * removed + */ + +/* ring watch flags to be used when watching the ring */ +#define RING_WF_WAIT_MODE 0x00000001 /* wait for new contents */ +#define RING_WF_SEEK_NEW 0x00000002 /* seek to new contents */ + +struct ring { + struct buffer buf; // storage area + struct list waiters; // list of waiters, for now, CLI "show event" + __decl_thread(HA_RWLOCK_T lock); + int readers_count; +}; + +#endif /* _HAPROXY_RING_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/ring.h b/include/haproxy/ring.h new file mode 100644 index 0000000..71217d5 --- /dev/null +++ b/include/haproxy/ring.h @@ -0,0 +1,53 @@ +/* + * include/haproxy/ring.h + * Exported functions for ring buffers used for disposable data. + * + * Copyright (C) 2000-2019 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_RING_H +#define _HAPROXY_RING_H + +#include <stdlib.h> +#include <import/ist.h> +#include <haproxy/ring-t.h> + +struct appctx; + +struct ring *ring_new(size_t size); +struct ring *ring_make_from_area(void *area, size_t size); +struct ring *ring_cast_from_area(void *area); +void ring_init(struct ring *ring, void* area, size_t size); +struct ring *ring_resize(struct ring *ring, size_t size); +void ring_free(struct ring *ring); +ssize_t ring_write(struct ring *ring, size_t maxlen, const struct ist pfx[], size_t npfx, const struct ist msg[], size_t nmsg); +int ring_attach(struct ring *ring); +void ring_detach_appctx(struct ring *ring, struct appctx *appctx, size_t ofs); +int ring_attach_cli(struct ring *ring, struct appctx *appctx, uint flags); +int cli_io_handler_show_ring(struct appctx *appctx); +void cli_io_release_show_ring(struct appctx *appctx); + +size_t ring_max_payload(const struct ring *ring); + +#endif /* _HAPROXY_RING_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/sample-t.h b/include/haproxy/sample-t.h new file mode 100644 index 0000000..27cf4ba --- /dev/null +++ b/include/haproxy/sample-t.h @@ -0,0 +1,315 @@ +/* + * include/haproxy/sample-t.h + * Macros, variables and structures for sample management. 
+ *
+ * Copyright (C) 2009-2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr>
+ * Copyright (C) 2012-2013 Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_SAMPLE_T_H
+#define _HAPROXY_SAMPLE_T_H
+
+#include <haproxy/api-t.h>
+#include <haproxy/sample_data-t.h>
+
+/* input and output sample types
+ *
+ * Some of them are pseudo types which means that they can be used for
+ * in_type and out_type in sample (fetches/conv) definitions (they serve as
+ * compatibility and conversion hints) but they cannot be emitted at runtime.
+ */
+enum {
+	SMP_T_ANY = 0,   /* pseudo type: any type */
+	SMP_T_SAME,      /* special: output type hint for converters that don't alter input type (out == in) */
+	SMP_T_BOOL,      /* boolean */
+	SMP_T_SINT,      /* signed 64-bit integer type */
+	SMP_T_ADDR,      /* pseudo type: could be ipv4 or ipv6 */
+	SMP_T_IPV4,      /* ipv4 type */
+	SMP_T_IPV6,      /* ipv6 type */
+	SMP_T_STR,       /* char string type */
+	SMP_T_BIN,       /* buffer type */
+	SMP_T_METH,      /* contains a method */
+	SMP_TYPES        /* number of types, must always be last */
+};
+
+/* Sample sources are used to establish a relation between fetch keywords and
+ * the location where they're about to be used. They're reserved for internal
+ * use and are not meant to be known outside the sample management code.
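+ *
+ * As a rough illustration of how these end up being used (simplified from
+ * actual fetch keyword declarations), a fetch keyword ORs the SMP_USE_*
+ * bit built from its source into its "use" field, e.g. something like :
+ *
+ *    { "src", smp_fetch_src, 0, NULL, SMP_T_ADDR, SMP_USE_L4CLI }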
+ */
+enum {
+	SMP_SRC_CONST,  /* constant elements known at configuration time */
+	SMP_SRC_INTRN,  /* internal context-less information */
+	SMP_SRC_LISTN,  /* listener which accepted the connection */
+	SMP_SRC_FTEND,  /* frontend which accepted the connection */
+	SMP_SRC_L4CLI,  /* L4 information about the client */
+	SMP_SRC_L5CLI,  /* fetch uses client information from embryonic session */
+	SMP_SRC_TRACK,  /* fetch involves track counters */
+	SMP_SRC_L6REQ,  /* fetch uses raw information from the request buffer */
+	SMP_SRC_HRQHV,  /* fetch uses volatile information about HTTP request headers (eg: value) */
+	SMP_SRC_HRQHP,  /* fetch uses persistent information about HTTP request headers (eg: meth) */
+	SMP_SRC_HRQBO,  /* fetch uses information about HTTP request body */
+	SMP_SRC_BKEND,  /* fetch uses information about the backend */
+	SMP_SRC_SERVR,  /* fetch uses information about the selected server */
+	SMP_SRC_L4SRV,  /* fetch uses information about the server L4 connection */
+	SMP_SRC_L5SRV,  /* fetch uses information about the server L5 connection */
+	SMP_SRC_L6RES,  /* fetch uses raw information from the response buffer */
+	SMP_SRC_HRSHV,  /* fetch uses volatile information about HTTP response headers (eg: value) */
+	SMP_SRC_HRSHP,  /* fetch uses persistent information about HTTP response headers (eg: status) */
+	SMP_SRC_HRSBO,  /* fetch uses information about HTTP response body */
+	SMP_SRC_RQFIN,  /* final information about request buffer (eg: tot bytes) */
+	SMP_SRC_RSFIN,  /* final information about response buffer (eg: tot bytes) */
+	SMP_SRC_TXFIN,  /* final information about the transaction (eg: #comp rate) */
+	SMP_SRC_SSFIN,  /* final information about the stream (eg: #requests, final flags) */
+	SMP_SRC_ENTRIES /* nothing after this */
+};
+
+/* Sample checkpoints are a list of places where samples may be used. This is
+ * an internal enum used only to build SMP_VAL_*.
+ */
+enum {
+	SMP_CKP_FE_CON_ACC,  /* FE connection accept rules ("tcp request connection") */
+	SMP_CKP_FE_SES_ACC,  /* FE stream accept rules (to come soon) */
+	SMP_CKP_FE_REQ_CNT,  /* FE request content rules ("tcp request content") */
+	SMP_CKP_FE_HRQ_HDR,  /* FE HTTP request headers (rules, headers, monitor, stats, redirect) */
+	SMP_CKP_FE_HRQ_BDY,  /* FE HTTP request body */
+	SMP_CKP_FE_SET_BCK,  /* FE backend switching rules ("use_backend") */
+	SMP_CKP_BE_REQ_CNT,  /* BE request content rules ("tcp request content") */
+	SMP_CKP_BE_HRQ_HDR,  /* BE HTTP request headers (rules, headers, monitor, stats, redirect) */
+	SMP_CKP_BE_HRQ_BDY,  /* BE HTTP request body */
+	SMP_CKP_BE_SET_SRV,  /* BE server switching rules ("use_server", "balance", "force-persist", "stick", ...) */
+	SMP_CKP_BE_SRV_CON,  /* BE server connect (eg: "source") */
+	SMP_CKP_BE_RES_CNT,  /* BE response content rules ("tcp response content") */
+	SMP_CKP_BE_HRS_HDR,  /* BE HTTP response headers (rules, headers) */
+	SMP_CKP_BE_HRS_BDY,  /* BE HTTP response body (stick-store rules are there) */
+	SMP_CKP_BE_STO_RUL,  /* BE stick-store rules */
+	SMP_CKP_FE_RES_CNT,  /* FE response content rules ("tcp response content") */
+	SMP_CKP_FE_HRS_HDR,  /* FE HTTP response headers (rules, headers) */
+	SMP_CKP_FE_HRS_BDY,  /* FE HTTP response body */
+	SMP_CKP_FE_LOG_END,  /* FE log at the end of the txn/stream */
+	SMP_CKP_BE_CHK_RUL,  /* BE tcp-check rules */
+	SMP_CKP_CFG_PARSER,  /* config parser (i.e.
before boot) */ + SMP_CKP_CLI_PARSER, /* command line parser */ + SMP_CKP_ENTRIES /* nothing after this */ +}; + +/* SMP_USE_* are flags used to declare fetch keywords. Fetch methods are + * associated with bitfields composed of these values, generally only one, to + * indicate where the contents may be sampled. Some fetches are ambiguous as + * they apply to either the request or the response depending on the context, + * so they will have 2 of these bits (eg: hdr(), payload(), ...). These are + * stored in smp->use. + */ +enum { + SMP_USE_CONST = 1 << SMP_SRC_CONST, /* constant values known at config time */ + SMP_USE_INTRN = 1 << SMP_SRC_INTRN, /* internal context-less information */ + SMP_USE_LISTN = 1 << SMP_SRC_LISTN, /* listener which accepted the connection */ + SMP_USE_FTEND = 1 << SMP_SRC_FTEND, /* frontend which accepted the connection */ + SMP_USE_L4CLI = 1 << SMP_SRC_L4CLI, /* L4 information about the client */ + SMP_USE_L5CLI = 1 << SMP_SRC_L5CLI, /* fetch uses client information from embryonic session */ + SMP_USE_TRACK = 1 << SMP_SRC_TRACK, /* fetch involves track counters */ + SMP_USE_L6REQ = 1 << SMP_SRC_L6REQ, /* fetch uses raw information from the request buffer */ + SMP_USE_HRQHV = 1 << SMP_SRC_HRQHV, /* fetch uses volatile information about HTTP request headers (eg: value) */ + SMP_USE_HRQHP = 1 << SMP_SRC_HRQHP, /* fetch uses persistent information about HTTP request headers (eg: meth) */ + SMP_USE_HRQBO = 1 << SMP_SRC_HRQBO, /* fetch uses information about HTTP request body */ + SMP_USE_BKEND = 1 << SMP_SRC_BKEND, /* fetch uses information about the backend */ + SMP_USE_SERVR = 1 << SMP_SRC_SERVR, /* fetch uses information about the selected server */ + SMP_USE_L4SRV = 1 << SMP_SRC_L4SRV, /* fetch uses information about the server L4 connection */ + SMP_USE_L5SRV = 1 << SMP_SRC_L5SRV, /* fetch uses information about the server L5 connection */ + SMP_USE_L6RES = 1 << SMP_SRC_L6RES, /* fetch uses raw information from the response buffer */ + SMP_USE_HRSHV = 1 << SMP_SRC_HRSHV, /* fetch uses volatile information about HTTP response headers (eg: value) */ + SMP_USE_HRSHP = 1 << SMP_SRC_HRSHP, /* fetch uses persistent information about HTTP response headers (eg: status) */ + SMP_USE_HRSBO = 1 << SMP_SRC_HRSBO, /* fetch uses information about HTTP response body */ + SMP_USE_RQFIN = 1 << SMP_SRC_RQFIN, /* final information about request buffer (eg: tot bytes) */ + SMP_USE_RSFIN = 1 << SMP_SRC_RSFIN, /* final information about response buffer (eg: tot bytes) */ + SMP_USE_TXFIN = 1 << SMP_SRC_TXFIN, /* final information about the transaction (eg: #comp rate) */ + SMP_USE_SSFIN = 1 << SMP_SRC_SSFIN, /* final information about the stream (eg: #requests, final flags) */ + + /* This composite one is useful to detect if an http_txn needs to be allocated */ + SMP_USE_HTTP_ANY = SMP_USE_HRQHV | SMP_USE_HRQHP | SMP_USE_HRQBO | + SMP_USE_HRSHV | SMP_USE_HRSHP | SMP_USE_HRSBO, +}; + +/* Sample validity is computed from the fetch sources above when keywords + * are registered. Each fetch method may be used at different locations. The + * configuration parser will check whether the fetches are compatible with the + * location where they're used. These are stored in smp->val. 
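+ *
+ * As a simplified illustration (not the literal parser code), the
+ * compatibility check boils down to ANDing the fetch's validity mask with
+ * the single SMP_VAL_* bit describing the current location :
+ *
+ *    if (!(expr->fetch->val & where))
+ *        ; // reject: this fetch is not usable at this checkpoint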
+ */ +enum { + SMP_VAL___________ = 0, /* Just used as a visual marker */ + SMP_VAL_FE_CON_ACC = 1 << SMP_CKP_FE_CON_ACC, /* FE connection accept rules ("tcp request connection") */ + SMP_VAL_FE_SES_ACC = 1 << SMP_CKP_FE_SES_ACC, /* FE stream accept rules (to come soon) */ + SMP_VAL_FE_REQ_CNT = 1 << SMP_CKP_FE_REQ_CNT, /* FE request content rules ("tcp request content") */ + SMP_VAL_FE_HRQ_HDR = 1 << SMP_CKP_FE_HRQ_HDR, /* FE HTTP request headers (rules, headers, monitor, stats, redirect) */ + SMP_VAL_FE_HRQ_BDY = 1 << SMP_CKP_FE_HRQ_BDY, /* FE HTTP request body */ + SMP_VAL_FE_SET_BCK = 1 << SMP_CKP_FE_SET_BCK, /* FE backend switching rules ("use_backend") */ + SMP_VAL_BE_REQ_CNT = 1 << SMP_CKP_BE_REQ_CNT, /* BE request content rules ("tcp request content") */ + SMP_VAL_BE_HRQ_HDR = 1 << SMP_CKP_BE_HRQ_HDR, /* BE HTTP request headers (rules, headers, monitor, stats, redirect) */ + SMP_VAL_BE_HRQ_BDY = 1 << SMP_CKP_BE_HRQ_BDY, /* BE HTTP request body */ + SMP_VAL_BE_SET_SRV = 1 << SMP_CKP_BE_SET_SRV, /* BE server switching rules ("use_server", "balance", "force-persist", "stick", ...) */ + SMP_VAL_BE_SRV_CON = 1 << SMP_CKP_BE_SRV_CON, /* BE server connect (eg: "source") */ + SMP_VAL_BE_RES_CNT = 1 << SMP_CKP_BE_RES_CNT, /* BE response content rules ("tcp response content") */ + SMP_VAL_BE_HRS_HDR = 1 << SMP_CKP_BE_HRS_HDR, /* BE HTTP response headers (rules, headers) */ + SMP_VAL_BE_HRS_BDY = 1 << SMP_CKP_BE_HRS_BDY, /* BE HTTP response body (stick-store rules are there) */ + SMP_VAL_BE_STO_RUL = 1 << SMP_CKP_BE_STO_RUL, /* BE stick-store rules */ + SMP_VAL_FE_RES_CNT = 1 << SMP_CKP_FE_RES_CNT, /* FE response content rules ("tcp response content") */ + SMP_VAL_FE_HRS_HDR = 1 << SMP_CKP_FE_HRS_HDR, /* FE HTTP response headers (rules, headers) */ + SMP_VAL_FE_HRS_BDY = 1 << SMP_CKP_FE_HRS_BDY, /* FE HTTP response body */ + SMP_VAL_FE_LOG_END = 1 << SMP_CKP_FE_LOG_END, /* FE log at the end of the txn/stream */ + SMP_VAL_BE_CHK_RUL = 1 << SMP_CKP_BE_CHK_RUL, /* BE tcp-check rule */ + SMP_VAL_CFG_PARSER = 1 << SMP_CKP_CFG_PARSER, /* within config parser */ + SMP_VAL_CLI_PARSER = 1 << SMP_CKP_CLI_PARSER, /* within command line parser */ + + /* a few combinations to decide what direction to try to fetch (useful for logs) */ + SMP_VAL_REQUEST = SMP_VAL_FE_CON_ACC | SMP_VAL_FE_SES_ACC | SMP_VAL_FE_REQ_CNT | + SMP_VAL_FE_HRQ_HDR | SMP_VAL_FE_HRQ_BDY | SMP_VAL_FE_SET_BCK | + SMP_VAL_BE_REQ_CNT | SMP_VAL_BE_HRQ_HDR | SMP_VAL_BE_HRQ_BDY | + SMP_VAL_BE_SET_SRV | SMP_VAL_BE_CHK_RUL, + + SMP_VAL_RESPONSE = SMP_VAL_BE_SRV_CON | SMP_VAL_BE_RES_CNT | SMP_VAL_BE_HRS_HDR | + SMP_VAL_BE_HRS_BDY | SMP_VAL_BE_STO_RUL | SMP_VAL_FE_RES_CNT | + SMP_VAL_FE_HRS_HDR | SMP_VAL_FE_HRS_BDY | SMP_VAL_FE_LOG_END | + SMP_VAL_BE_CHK_RUL, +}; + +/* Sample fetch options are passed to sample fetch functions to add precision + * about what is desired : + * - fetch direction (req/resp) + * - intermediary / final fetch + */ +enum { + SMP_OPT_DIR_REQ = 0, /* direction = request */ + SMP_OPT_DIR_RES = 1, /* direction = response */ + SMP_OPT_DIR = (SMP_OPT_DIR_REQ|SMP_OPT_DIR_RES), /* mask to get direction */ + SMP_OPT_FINAL = 2, /* final fetch, contents won't change anymore */ + SMP_OPT_ITERATE = 4, /* fetches may be iterated if supported (for ACLs) */ +}; + +/* Flags used to describe fetched samples. MAY_CHANGE indicates that the result + * of the fetch might still evolve, for instance because of more data expected, + * even if the fetch has failed. VOL_* indicates how long a result may be cached. 
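+ *
+ * For instance, a failed fetch which sets SMP_F_MAY_CHANGE is usually
+ * retried until the final evaluation, along the lines of (simplified) :
+ *
+ *    if (!ret && (smp->flags & SMP_F_MAY_CHANGE) && !(opt & SMP_OPT_FINAL))
+ *        return ACL_TEST_MISS;  // more data may still arrive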
+ */
+enum {
+	SMP_F_NOT_LAST   = 1 << 0, /* other occurrences might exist for this sample */
+	SMP_F_MAY_CHANGE = 1 << 1, /* sample is unstable and might change (eg: request length) */
+	SMP_F_VOL_TEST   = 1 << 2, /* result must not survive longer than the test (eg: time) */
+	SMP_F_VOL_1ST    = 1 << 3, /* result sensitive to changes in first line (eg: URI) */
+	SMP_F_VOL_HDR    = 1 << 4, /* result sensitive to changes in headers */
+	SMP_F_VOL_TXN    = 1 << 5, /* result sensitive to new transaction (eg: HTTP version) */
+	SMP_F_VOL_SESS   = 1 << 6, /* result sensitive to new session (eg: src IP) */
+	SMP_F_VOLATILE   = (1<<2)|(1<<3)|(1<<4)|(1<<5)|(1<<6), /* any volatility condition */
+	SMP_F_CONST      = 1 << 7, /* This sample uses constant memory. It may need to be duplicated before changes */
+};
+
+/* needed below */
+struct session;
+struct stream;
+struct arg;
+
+/* a sample context might be used by any sample fetch function in order to
+ * store information needed across multiple calls (eg: restart point for a
+ * next occurrence). By definition it may store up to 8 pointers, or any
+ * scalar (double, int, long long).
+ */
+union smp_ctx {
+	void *p;      /* any pointer */
+	int i;        /* any integer */
+	long long ll; /* any long long or smaller */
+	double d;     /* any float or double */
+	void *a[8];   /* any array of up to 8 pointers */
+};
+
+/* a sample is typed data extracted from a stream. It has a type, contents,
+ * validity constraints, and a context for use in iterative calls.
+ */
+struct sample {
+	unsigned int flags;       /* SMP_F_* */
+	struct sample_data data;
+	union smp_ctx ctx;
+
+	/* Some sample analyzers (sample fetches or converters) need to
+	 * know the attached proxy, session and stream. The sample-fetch
+	 * and converter function pointers cannot be called without
+	 * these 3 pointers filled.
+	 */
+	struct proxy *px;
+	struct session *sess;
+	struct stream *strm;      /* WARNING! MAY BE NULL! (eg: tcp-request connection) */
+	unsigned int opt;         /* fetch options (SMP_OPT_*) */
+};
+
+/* Descriptor for a sample conversion */
+struct sample_conv {
+	const char *kw;                    /* configuration keyword */
+	int (*process)(const struct arg *arg_p,
+	               struct sample *smp,
+	               void *private);     /* process function */
+	uint64_t arg_mask;                 /* arguments (ARG*()) */
+	int (*val_args)(struct arg *arg_p,
+	                struct sample_conv *smp_conv,
+	                const char *file, int line,
+	                char **err_msg);   /* argument validation function */
+	unsigned int in_type;              /* expected input sample type */
+	unsigned int out_type;             /* output sample type */
+	void *private;                     /* private values. only used by maps and Lua */
+};
+
+/* sample conversion expression */
+struct sample_conv_expr {
+	struct list list;                  /* member of a sample_expr */
+	struct sample_conv *conv;          /* sample conversion used */
+	struct arg *arg_p;                 /* optional arguments */
+};
+
+/* Descriptor for a sample fetch method */
+struct sample_fetch {
+	const char *kw;                    /* configuration keyword */
+	int (*process)(const struct arg *arg_p,
+	               struct sample *smp,
+	               const char *kw,     /* fetch processing function */
+	               void *private);     /* private value. */
+	uint64_t arg_mask;                 /* arguments (ARG*()) */
+	int (*val_args)(struct arg *arg_p,
+	                char **err_msg);   /* argument validation function */
+	unsigned long out_type;            /* output sample type */
+	unsigned int use;                  /* fetch source (SMP_USE_*) */
+	unsigned int val;                  /* fetch validity (SMP_VAL_*) */
+	void *private;                     /* private values.
only used by Lua */
+};
+
+/* sample expression */
+struct sample_expr {
+	struct list list;                  /* member of a list of samples, currently not used */
+	struct sample_fetch *fetch;        /* sample fetch method */
+	struct arg *arg_p;                 /* optional pointer to arguments to fetch function */
+	struct list conv_exprs;            /* list of conversion expressions to apply */
+};
+
+/* sample fetch keywords list */
+struct sample_fetch_kw_list {
+	struct list list;                  /* head of sample fetch keyword list */
+	struct sample_fetch kw[VAR_ARRAY]; /* array of sample fetch descriptors */
+};
+
+/* sample conversion keywords list */
+struct sample_conv_kw_list {
+	struct list list;                  /* head of sample conversion keyword list */
+	struct sample_conv kw[VAR_ARRAY];  /* array of sample conversion descriptors */
+};
+
+typedef int (*sample_cast_fct)(struct sample *smp);
+
+#endif /* _HAPROXY_SAMPLE_T_H */
diff --git a/include/haproxy/sample.h b/include/haproxy/sample.h
new file mode 100644
index 0000000..7e05e78
--- /dev/null
+++ b/include/haproxy/sample.h
@@ -0,0 +1,186 @@
+/*
+ * include/haproxy/sample.h
+ * Functions for sample management.
+ *
+ * Copyright (C) 2009-2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr>
+ * Copyright (C) 2012 Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_SAMPLE_H
+#define _HAPROXY_SAMPLE_H
+
+#include <haproxy/api.h>
+#include <haproxy/arg-t.h>
+#include <haproxy/sample-t.h>
+#include <haproxy/stick_table-t.h>
+
+extern sample_cast_fct sample_casts[SMP_TYPES][SMP_TYPES];
+extern const unsigned int fetch_cap[SMP_SRC_ENTRIES];
+extern const char *smp_to_type[SMP_TYPES];
+
+struct sample_expr *sample_parse_expr(char **str, int *idx, const char *file, int line, char **err, struct arg_list *al, char **endptr);
+int sample_parse_expr_cnv(char **str, int *idx, char **endptr, char **err_msg, struct arg_list *al, const char *file, int line,
+                          struct sample_expr *expr, const char *start);
+struct sample_conv *find_sample_conv(const char *kw, int len);
+struct sample *sample_process(struct proxy *px, struct session *sess,
+                              struct stream *strm, unsigned int opt,
+                              struct sample_expr *expr, struct sample *p);
+int sample_process_cnv(struct sample_expr *expr, struct sample *p);
+struct sample *sample_fetch_as_type(struct proxy *px, struct session *sess,
+                                    struct stream *strm, unsigned int opt,
+                                    struct sample_expr *expr, int smp_type);
+int sample_conv_var2smp(const struct var_desc *var, struct sample *smp, int type);
+int sample_conv_var2smp_sint(const struct arg *arg, struct sample *smp);
+int sample_conv_var2smp_str(const struct arg *arg, struct sample *smp);
+void release_sample_expr(struct sample_expr *expr);
+void sample_register_fetches(struct sample_fetch_kw_list *psl);
+void sample_register_convs(struct sample_conv_kw_list *psl);
+const char *sample_src_names(unsigned int use);
+const char
*sample_ckp_names(unsigned int use);
+struct sample_fetch *find_sample_fetch(const char *kw, int len);
+void smp_dump_fetch_kw(void);
+void smp_dump_conv_kw(void);
+struct sample_fetch *sample_fetch_getnext(struct sample_fetch *current, int *idx);
+struct sample_conv *sample_conv_getnext(struct sample_conv *current, int *idx);
+int smp_resolve_args(struct proxy *p, char **err);
+int smp_check_date_unit(struct arg *args, char **err);
+int smp_expr_output_type(struct sample_expr *expr);
+int c_none(struct sample *smp);
+int c_pseudo(struct sample *smp);
+int smp_dup(struct sample *smp);
+
+/*
+ * This function just applies a cast on a sample. It returns 0 if the cast is
+ * not available or if the cast fails, otherwise it returns 1. It does not
+ * modify the input sample on failure.
+ */
+static inline
+int sample_convert(struct sample *sample, int req_type)
+{
+	if (!sample_casts[sample->data.type][req_type])
+		return 0;
+	if (sample_casts[sample->data.type][req_type] == c_none)
+		return 1;
+	return sample_casts[sample->data.type][req_type](sample);
+}
+
+static inline
+struct sample *smp_set_owner(struct sample *smp, struct proxy *px,
+                             struct session *sess, struct stream *strm, int opt)
+{
+	smp->px = px;
+	smp->sess = sess;
+	smp->strm = strm;
+	smp->opt = opt;
+	return smp;
+}
+
+
+/* Returns 1 if a sample may be safely used. It performs a few checks on the
+ * string length versus size, same for the binary version, and ensures that
+ * strings are properly terminated by a zero. If this last point is not granted
+ * but the string is not const, then the \0 is appended. Otherwise it returns 0,
+ * meaning the caller may need to call smp_dup() before going further.
+ */
+static inline
+int smp_is_safe(struct sample *smp)
+{
+	switch (smp->data.type) {
+	case SMP_T_METH:
+		if (smp->data.u.meth.meth != HTTP_METH_OTHER)
+			return 1;
+		__fallthrough;
+
+	case SMP_T_STR:
+		if (!smp->data.u.str.size || smp->data.u.str.data >= smp->data.u.str.size)
+			return 0;
+
+		if (smp->data.u.str.area[smp->data.u.str.data] == 0)
+			return 1;
+
+		if (smp->flags & SMP_F_CONST)
+			return 0;
+
+		smp->data.u.str.area[smp->data.u.str.data] = 0;
+		return 1;
+
+	case SMP_T_BIN:
+		return !smp->data.u.str.size || smp->data.u.str.data <= smp->data.u.str.size;
+
+	default:
+		return 1;
+	}
+}
+
+/* checks that a sample may freely be used, or duplicates it to normalize it.
+ * Returns 1 on success, 0 if the sample must not be used. The function also
+ * checks for NULL to simplify the calling code.
+ */
+static inline
+int smp_make_safe(struct sample *smp)
+{
+	return smp && (smp_is_safe(smp) || smp_dup(smp));
+}
+
+/* Returns 1 if a sample may be safely modified in place. It performs a few
+ * checks on the string length versus size, same for the binary version, and
+ * ensures that strings are properly terminated by a zero, and of course that
+ * the size is allocated and that the SMP_F_CONST flag is not set. If only the
+ * trailing zero is missing, it is appended. Otherwise it returns 0, meaning
+ * the caller may need to call smp_dup() before going further.
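+ *
+ * A typical calling pattern (simplified sketch) goes through smp_make_rw()
+ * below, which combines this check with smp_dup(), before touching the
+ * area in place :
+ *
+ *    if (!smp_make_rw(smp))
+ *        return 0;                  // sample cannot be made writable
+ *    smp->data.u.str.area[0] = 'x'; // now safe to modify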
+ */ +static inline +int smp_is_rw(struct sample *smp) +{ + if (smp->flags & SMP_F_CONST) + return 0; + + switch (smp->data.type) { + case SMP_T_METH: + if (smp->data.u.meth.meth != HTTP_METH_OTHER) + return 1; + __fallthrough; + + case SMP_T_STR: + if (!smp->data.u.str.size || + smp->data.u.str.data >= smp->data.u.str.size) + return 0; + + if (smp->data.u.str.area[smp->data.u.str.data] != 0) + smp->data.u.str.area[smp->data.u.str.data] = 0; + return 1; + + case SMP_T_BIN: + return smp->data.u.str.size && + smp->data.u.str.data <= smp->data.u.str.size; + + default: + return 1; + } +} + +/* checks that a sample may freely be modified, or duplicates it to normalize + * it and make it R/W. Returns 1 on success, 0 if the sample must not be used. + * The function also checks for NULL to simplify the calling code. + */ +static inline +int smp_make_rw(struct sample *smp) +{ + return smp && (smp_is_rw(smp) || smp_dup(smp)); +} + +#endif /* _HAPROXY_SAMPLE_H */ diff --git a/include/haproxy/sample_data-t.h b/include/haproxy/sample_data-t.h new file mode 100644 index 0000000..2546028 --- /dev/null +++ b/include/haproxy/sample_data-t.h @@ -0,0 +1,51 @@ +/* + * include/haproxy/sample_data-t.h + * Definitions of sample data + * + * Copyright (C) 2009-2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr> + * Copyright (C) 2020 Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SAMPLE_DATA_T_H +#define _HAPROXY_SAMPLE_DATA_T_H + +#include <sys/socket.h> +#include <netinet/in.h> +#include <haproxy/buf-t.h> +#include <haproxy/http-t.h> + +/* Note: the strings below make use of chunks. Chunks may carry an allocated + * size in addition to the length. The size counts from the beginning (str) + * to the end. If the size is unknown, it MUST be zero, in which case the + * sample will automatically be duplicated when a change larger than <len> has + * to be performed. Thus it is safe to always set size to zero. 
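+ *
+ * For example, a constant string may be mapped without a copy by leaving
+ * the size unknown, so that any modification first duplicates it
+ * (illustrative only) :
+ *
+ *    smp->data.u.str.area = (char *)"GET";
+ *    smp->data.u.str.data = 3;   // current length
+ *    smp->data.u.str.size = 0;   // unknown allocation: dup before change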
+ */ +union sample_value { + long long int sint; /* used for signed 64bits integers */ + struct in_addr ipv4; /* used for ipv4 addresses */ + struct in6_addr ipv6; /* used for ipv6 addresses */ + struct buffer str; /* used for char strings or buffers */ + struct http_meth meth; /* used for http method */ +}; + +/* Used to store sample constant */ +struct sample_data { + int type; /* SMP_T_* */ + union sample_value u; /* sample data */ +}; + +#endif /* _HAPROXY_SAMPLE_DATA_T_H */ diff --git a/include/haproxy/sc_strm.h b/include/haproxy/sc_strm.h new file mode 100644 index 0000000..41f07e9 --- /dev/null +++ b/include/haproxy/sc_strm.h @@ -0,0 +1,447 @@ +/* + * include/haproxy/sc_strm.h + * This file contains stream-specific stream-connector functions prototypes + * + * Copyright 2022 Christopher Faulet <cfaulet@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SC_STRM_H +#define _HAPROXY_SC_STRM_H + +#include <haproxy/api.h> +#include <haproxy/buf-t.h> +#include <haproxy/channel-t.h> +#include <haproxy/stream-t.h> +#include <haproxy/task-t.h> +#include <haproxy/connection.h> +#include <haproxy/channel.h> +#include <haproxy/session.h> +#include <haproxy/stconn.h> +#include <haproxy/stream.h> + +void sc_update_rx(struct stconn *sc); +void sc_update_tx(struct stconn *sc); + +struct task *sc_conn_io_cb(struct task *t, void *ctx, unsigned int state); +int sc_conn_sync_recv(struct stconn *sc); +void sc_conn_sync_send(struct stconn *sc); + + +/* returns the channel which receives data from this stream connector (input channel) */ +static inline struct channel *sc_ic(const struct stconn *sc) +{ + struct stream *strm = __sc_strm(sc); + + return ((sc->flags & SC_FL_ISBACK) ? &(strm->res) : &(strm->req)); +} + +/* returns the channel which feeds data to this stream connector (output channel) */ +static inline struct channel *sc_oc(const struct stconn *sc) +{ + struct stream *strm = __sc_strm(sc); + + return ((sc->flags & SC_FL_ISBACK) ? &(strm->req) : &(strm->res)); +} + +/* returns the buffer which receives data from this stream connector (input channel's buffer) */ +static inline struct buffer *sc_ib(const struct stconn *sc) +{ + return &sc_ic(sc)->buf; +} + +/* returns the buffer which feeds data to this stream connector (output channel's buffer) */ +static inline struct buffer *sc_ob(const struct stconn *sc) +{ + return &sc_oc(sc)->buf; +} +/* returns the stream's task associated to this stream connector */ +static inline struct task *sc_strm_task(const struct stconn *sc) +{ + struct stream *strm = __sc_strm(sc); + + return strm->task; +} + +/* returns the stream connector on the other side. Used during forwarding. */ +static inline struct stconn *sc_opposite(const struct stconn *sc) +{ + struct stream *strm = __sc_strm(sc); + + return ((sc->flags & SC_FL_ISBACK) ? 
strm->scf : strm->scb);
+}
+
+
+/* sets the current and previous state of a stream connector to <state>. This is
+ * mainly used to create one in the established state on incoming connections.
+ */
+static inline void sc_set_state(struct stconn *sc, int state)
+{
+	sc->state = __sc_strm(sc)->prev_conn_state = state;
+}
+
+/* returns a bit for a stream connector state, to match against SC_SB_* */
+static inline enum sc_state_bit sc_state_bit(enum sc_state state)
+{
+	BUG_ON(state > SC_ST_CLO);
+	return 1U << state;
+}
+
+/* returns true if <state> matches one of the SC_SB_* bits in <mask> */
+static inline int sc_state_in(enum sc_state state, enum sc_state_bit mask)
+{
+	BUG_ON(mask & ~SC_SB_ALL);
+	return !!(sc_state_bit(state) & mask);
+}
+
+/* Returns true if a connection is attached to the stream connector <sc> and if this
+ * connection is ready.
+ */
+static inline int sc_conn_ready(const struct stconn *sc)
+{
+	const struct connection *conn = sc_conn(sc);
+
+	return conn && conn_ctrl_ready(conn) && conn_xprt_ready(conn);
+}
+
+
+/* The stream connector is only responsible for the connection during the early
+ * states, before plugging a mux. Thus it should only care about CO_FL_ERROR
+ * before SC_ST_EST, and after that it must absolutely ignore it since the mux
+ * may hold pending data. This function returns true if such an error was
+ * reported. Both the SC and the CONN must be valid.
+ */
+static inline int sc_is_conn_error(const struct stconn *sc)
+{
+	const struct connection *conn;
+
+	if (sc->state >= SC_ST_EST)
+		return 0;
+
+	conn = __sc_conn(sc);
+	BUG_ON(!conn);
+	return !!(conn->flags & CO_FL_ERROR);
+}
+
+/* Try to allocate a buffer for the stream connector's input channel. It relies on
+ * channel_alloc_buffer() for this so it abides by its rules. It returns 0 on
+ * failure, non-zero otherwise. If no buffer is available, the requester,
+ * represented by the <wait> pointer, will be added in the list of objects
+ * waiting for an available buffer, and SC_FL_NEED_BUFF will be set on the
+ * stream connector and SE_FL_HAVE_NO_DATA cleared. The requester will be responsible
+ * for calling this function to try again once woken up.
+ */
+static inline int sc_alloc_ibuf(struct stconn *sc, struct buffer_wait *wait)
+{
+	int ret;
+
+	ret = channel_alloc_buffer(sc_ic(sc), wait);
+	if (!ret)
+		sc_need_buff(sc);
+	return ret;
+}
+
+
+/* Returns the source address of the stream connector and, if not set, falls back
+ * on the session for a frontend SC and on the server connection for a backend
+ * SC. It returns a const address on success or NULL on failure.
+ */
+static inline const struct sockaddr_storage *sc_src(const struct stconn *sc)
+{
+	if (sc->src)
+		return sc->src;
+	if (!(sc->flags & SC_FL_ISBACK))
+		return sess_src(strm_sess(__sc_strm(sc)));
+	else {
+		struct connection *conn = sc_conn(sc);
+
+		if (conn)
+			return conn_src(conn);
+	}
+	return NULL;
+}
+
+
+/* Returns the destination address of the stream connector and, if not set, falls
+ * back on the session for a frontend SC and on the server connection for a
+ * backend SC. It returns a const address on success or NULL on failure.
+ */
+static inline const struct sockaddr_storage *sc_dst(const struct stconn *sc)
+{
+	if (sc->dst)
+		return sc->dst;
+	if (!(sc->flags & SC_FL_ISBACK))
+		return sess_dst(strm_sess(__sc_strm(sc)));
+	else {
+		struct connection *conn = sc_conn(sc);
+
+		if (conn)
+			return conn_dst(conn);
+	}
+	return NULL;
+}
+
+/* Retrieves the source address of the stream connector.
Returns non-zero on success + * or zero on failure. The operation is only performed once and the address is + * stored in the stream connector for future use. On the first call, the stream connector + * source address is copied from the session one for frontend SC and the server + * connection for the backend SC. + */ +static inline int sc_get_src(struct stconn *sc) +{ + const struct sockaddr_storage *src = NULL; + + if (sc->src) + return 1; + + if (!(sc->flags & SC_FL_ISBACK)) + src = sess_src(strm_sess(__sc_strm(sc))); + else { + struct connection *conn = sc_conn(sc); + + if (conn) + src = conn_src(conn); + } + if (!src) + return 0; + + if (!sockaddr_alloc(&sc->src, src, sizeof(*src))) + return 0; + + return 1; +} + +/* Retrieves the destination address of the stream connector. Returns non-zero on + * success or zero on failure. The operation is only performed once and the + * address is stored in the stream connector for future use. On the first call, the + * stream connector destination address is copied from the session one for frontend + * SC and the server connection for the backend SC. + */ +static inline int sc_get_dst(struct stconn *sc) +{ + const struct sockaddr_storage *dst = NULL; + + if (sc->dst) + return 1; + + if (!(sc->flags & SC_FL_ISBACK)) + dst = sess_dst(strm_sess(__sc_strm(sc))); + else { + struct connection *conn = sc_conn(sc); + + if (conn) + dst = conn_dst(conn); + } + if (!dst) + return 0; + + if (!sockaddr_alloc(&sc->dst, dst, sizeof(*dst))) + return 0; + + return 1; +} + + +/* Marks on the stream connector that next shutdown must kill the whole connection */ +static inline void sc_must_kill_conn(struct stconn *sc) +{ + sc_ep_set(sc, SE_FL_KILL_CONN); +} + + +/* Returns non-zero if the stream connector is allowed to receive from the + * endpoint, which means that no flag indicating a blocked channel, lack of + * buffer or room is set, and that the endpoint is not waiting for the + * application to complete a connection setup on the other side, and that + * the stream's channel is not shut for reads. This is only used by stream + * applications. + */ +__attribute__((warn_unused_result)) +static inline int sc_is_recv_allowed(const struct stconn *sc) +{ + if (sc->flags & (SC_FL_ABRT_DONE|SC_FL_EOS)) + return 0; + + if (sc_ep_test(sc, SE_FL_APPLET_NEED_CONN)) + return 0; + + if (sc_ep_test(sc, SE_FL_HAVE_NO_DATA)) + return 0; + + if (sc_ep_test(sc, SE_FL_MAY_FASTFWD_PROD) && (sc_opposite(sc)->sedesc->iobuf.flags & IOBUF_FL_FF_BLOCKED)) + return 0; + + return !(sc->flags & (SC_FL_WONT_READ|SC_FL_NEED_BUFF|SC_FL_NEED_ROOM)); +} + +/* This is to be used after making some room available in a channel. It will + * return without doing anything if the stream connector's RX path is blocked. + * It will automatically mark the stream connector as busy processing the end + * point in order to avoid useless repeated wakeups. + * It will then call ->chk_rcv() to enable receipt of new data. 
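+ *
+ * A typical call site (simplified sketch, <bytes> being whatever was just
+ * consumed) first makes room in the channel, then re-enables receipt on
+ * the connector feeding it :
+ *
+ *    co_skip(sc_oc(sc), bytes);      // consume pending output data
+ *    sc_chk_rcv(sc_opposite(sc));    // let the producer fill it again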
+ */ +static inline void sc_chk_rcv(struct stconn *sc) +{ + if (sc_ep_test(sc, SE_FL_APPLET_NEED_CONN) && + sc_state_in(sc_opposite(sc)->state, SC_SB_RDY|SC_SB_EST|SC_SB_DIS|SC_SB_CLO)) { + sc_ep_clr(sc, SE_FL_APPLET_NEED_CONN); + sc_ep_report_read_activity(sc); + } + + if (!sc_is_recv_allowed(sc)) + return; + + if (!sc_state_in(sc->state, SC_SB_RDY|SC_SB_EST)) + return; + + sc_ep_set(sc, SE_FL_HAVE_NO_DATA); + if (likely(sc->app_ops->chk_rcv)) + sc->app_ops->chk_rcv(sc); +} + +/* Calls chk_snd on the endpoint using the data layer */ +static inline void sc_chk_snd(struct stconn *sc) +{ + if (likely(sc->app_ops->chk_snd)) + sc->app_ops->chk_snd(sc); +} + +/* Combines both sc_update_rx() and sc_update_tx() at once */ +static inline void sc_update(struct stconn *sc) +{ + sc_update_rx(sc); + sc_update_tx(sc); +} + +/* for debugging, reports the stream connector state name */ +static inline const char *sc_state_str(int state) +{ + switch (state) { + case SC_ST_INI: return "INI"; + case SC_ST_REQ: return "REQ"; + case SC_ST_QUE: return "QUE"; + case SC_ST_TAR: return "TAR"; + case SC_ST_ASS: return "ASS"; + case SC_ST_CON: return "CON"; + case SC_ST_CER: return "CER"; + case SC_ST_RDY: return "RDY"; + case SC_ST_EST: return "EST"; + case SC_ST_DIS: return "DIS"; + case SC_ST_CLO: return "CLO"; + default: return "???"; + } +} + +/* indicates if the connector may send data to the endpoint, that is, the + * endpoint is both willing to receive data and ready to do so. This is only + * used with applets so there's always a stream attached to this connector. + */ +__attribute__((warn_unused_result)) +static inline int sc_is_send_allowed(const struct stconn *sc) +{ + if (sc->flags & SC_FL_SHUT_DONE) + return 0; + + return !sc_ep_test(sc, SE_FL_WAIT_DATA | SE_FL_WONT_CONSUME); +} + +static inline int sc_rcv_may_expire(const struct stconn *sc) +{ + if ((sc->flags & (SC_FL_ABRT_DONE|SC_FL_EOS)) || (sc_ic(sc)->flags & CF_READ_TIMEOUT)) + return 0; + if (sc->flags & (SC_FL_EOI|SC_FL_WONT_READ|SC_FL_NEED_BUFF|SC_FL_NEED_ROOM)) + return 0; + if (sc_ep_test(sc, SE_FL_APPLET_NEED_CONN) || sc_ep_test(sc_opposite(sc), SE_FL_EXP_NO_DATA)) + return 0; + return 1; +} + +static inline int sc_snd_may_expire(const struct stconn *sc) +{ + if ((sc->flags & SC_FL_SHUT_DONE) || (sc_oc(sc)->flags & CF_WRITE_TIMEOUT)) + return 0; + if (sc_ep_test(sc, SE_FL_WONT_CONSUME)) + return 0; + return 1; +} + +static forceinline int sc_ep_rcv_ex(const struct stconn *sc) +{ + return ((tick_isset(sc->sedesc->lra) && sc_rcv_may_expire(sc)) + ? tick_add_ifset(sc->sedesc->lra, sc->ioto) + : TICK_ETERNITY); +} + +static forceinline int sc_ep_snd_ex(const struct stconn *sc) +{ + return ((tick_isset(sc->sedesc->fsb) && sc_snd_may_expire(sc)) + ? 
tick_add_ifset(sc->sedesc->fsb, sc->ioto) + : TICK_ETERNITY); +} + +static inline void sc_check_timeouts(const struct stconn *sc) +{ + if (unlikely(tick_is_expired(sc_ep_rcv_ex(sc), now_ms))) + sc_ic(sc)->flags |= CF_READ_TIMEOUT; + if (unlikely(tick_is_expired(sc_ep_snd_ex(sc), now_ms))) + sc_oc(sc)->flags |= CF_WRITE_TIMEOUT; +} + +static inline void sc_set_hcto(struct stconn *sc) +{ + struct stream *strm = __sc_strm(sc); + + if (IS_HTX_STRM(strm)) + return; + + if (sc->flags & SC_FL_ISBACK) { + if ((strm->flags & SF_BE_ASSIGNED) && tick_isset(strm->be->timeout.serverfin)) + sc->ioto = strm->be->timeout.serverfin; + } + else { + if (tick_isset(strm_fe(strm)->timeout.clientfin)) + sc->ioto = strm_fe(strm)->timeout.clientfin; + } + +} + +/* Schedule an abort for the SC */ +static inline void sc_schedule_abort(struct stconn *sc) +{ + sc->flags |= SC_FL_ABRT_WANTED; +} + +/* Abort the SC and notify the endpoint using the data layer */ +static inline void sc_abort(struct stconn *sc) +{ + if (likely(sc->app_ops->abort)) + sc->app_ops->abort(sc); +} + +/* Schedule a shutdown for the SC */ +static inline void sc_schedule_shutdown(struct stconn *sc) +{ + sc->flags |= SC_FL_SHUT_WANTED; +} + +/* Shutdown the SC and notify the endpoint using the data layer */ +static inline void sc_shutdown(struct stconn *sc) +{ + if (likely(sc->app_ops->shutdown)) + sc->app_ops->shutdown(sc); +} + +#endif /* _HAPROXY_SC_STRM_H */ diff --git a/include/haproxy/server-t.h b/include/haproxy/server-t.h new file mode 100644 index 0000000..666d2cc --- /dev/null +++ b/include/haproxy/server-t.h @@ -0,0 +1,681 @@ +/* + * include/haproxy/server-t.h + * This file defines everything related to servers. + * + * Copyright (C) 2000-2012 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SERVER_T_H +#define _HAPROXY_SERVER_T_H + +#include <netinet/in.h> +#include <arpa/inet.h> + +#include <import/ebtree-t.h> + +#include <haproxy/api-t.h> +#include <haproxy/check-t.h> +#include <haproxy/connection-t.h> +#include <haproxy/counters-t.h> +#include <haproxy/freq_ctr-t.h> +#include <haproxy/listener-t.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/queue-t.h> +#include <haproxy/quic_tp-t.h> +#include <haproxy/resolvers-t.h> +#include <haproxy/stats-t.h> +#include <haproxy/task-t.h> +#include <haproxy/thread-t.h> +#include <haproxy/event_hdl-t.h> +#include <haproxy/tools-t.h> + + +/* server states. Only SRV_ST_STOPPED indicates a down server. */ +enum srv_state { + SRV_ST_STOPPED = 0, /* the server is down. Please keep set to zero. 
*/
+	SRV_ST_STARTING,            /* the server is warming up (up but throttled) */
+	SRV_ST_RUNNING,             /* the server is fully up */
+	SRV_ST_STOPPING,            /* the server is up but soft-stopping (eg: 404) */
+} __attribute__((packed));
+
+/* Administrative status : a server runs in one of these 3 states :
+ *   - READY : normal mode
+ *   - DRAIN : takes no new visitor, equivalent to weight == 0
+ *   - MAINT : maintenance mode, no more traffic nor health checks.
+ *
+ * Each server may be in maintenance by itself or may inherit this status from
+ * another server it tracks. It can also be in drain mode by itself or inherit
+ * it from another server. Let's store these origins here as flags. These flags
+ * are combined this way :
+ *
+ *      FMAINT  IMAINT  FDRAIN  IDRAIN  Resulting state
+ *         0       0       0       0    READY
+ *         0       0       0       1    DRAIN
+ *         0       0       1       x    DRAIN
+ *         0       1       x       x    MAINT
+ *         1       x       x       x    MAINT
+ *
+ * This can be simplified this way :
+ *
+ *   state_str = (state & MAINT) ? "MAINT" : (state & DRAIN) ? "DRAIN" : "READY"
+ */
+enum srv_admin {
+	SRV_ADMF_FMAINT    = 0x01,  /* the server was explicitly forced into maintenance */
+	SRV_ADMF_IMAINT    = 0x02,  /* the server has inherited the maintenance status from a tracked server */
+	SRV_ADMF_MAINT     = 0x23,  /* mask to check if any maintenance flag is present */
+	SRV_ADMF_CMAINT    = 0x04,  /* the server is in maintenance because of the configuration */
+	SRV_ADMF_FDRAIN    = 0x08,  /* the server was explicitly forced into drain state */
+	SRV_ADMF_IDRAIN    = 0x10,  /* the server has inherited the drain status from a tracked server */
+	SRV_ADMF_DRAIN     = 0x18,  /* mask to check if any drain flag is present */
+	SRV_ADMF_RMAINT    = 0x20,  /* the server is down because of an IP address resolution failure */
+	SRV_ADMF_HMAINT    = 0x40,  /* the server FQDN has been set from the stats socket */
+} __attribute__((packed));
+
+/* options for servers' "init-addr" parameter
+ * this parameter may be used to drive HAProxy's behavior when parsing a server
+ * address at start up time.
+ * These values are stored as a list into an integer ordered from first to last
+ * starting with the lowest to highest bits. SRV_IADDR_END (0) is used to
+ * indicate the end of the list. 3 bits are enough to store each value.
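+ *
+ * For example, "init-addr libc,last,none" would be packed as follows
+ * (illustrative only) :
+ *
+ *    methods = SRV_IADDR_LIBC           // bits 0-2
+ *            | (SRV_IADDR_LAST << 3)    // bits 3-5
+ *            | (SRV_IADDR_NONE << 6);   // bits 6-8, implicit 0 (END) above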
+ */
+enum srv_initaddr {
+	SRV_IADDR_END      = 0,  /* end of the list */
+	SRV_IADDR_NONE     = 1,  /* the server won't have any address at start up */
+	SRV_IADDR_LIBC     = 2,  /* address set using the libc DNS resolver */
+	SRV_IADDR_LAST     = 3,  /* we set the IP address found in state-file for this server */
+	SRV_IADDR_IP       = 4,  /* we set an arbitrary IP address to the server */
+} __attribute__((packed));
+
+/* server-state-file version */
+#define SRV_STATE_FILE_VERSION 1
+#define SRV_STATE_FILE_VERSION_MIN 1
+#define SRV_STATE_FILE_VERSION_MAX 1
+#define SRV_STATE_FILE_FIELD_NAMES \
+    "be_id "                       \
+    "be_name "                     \
+    "srv_id "                      \
+    "srv_name "                    \
+    "srv_addr "                    \
+    "srv_op_state "                \
+    "srv_admin_state "             \
+    "srv_uweight "                 \
+    "srv_iweight "                 \
+    "srv_time_since_last_change "  \
+    "srv_check_status "            \
+    "srv_check_result "            \
+    "srv_check_health "            \
+    "srv_check_state "             \
+    "srv_agent_state "             \
+    "bk_f_forced_id "              \
+    "srv_f_forced_id "             \
+    "srv_fqdn "                    \
+    "srv_port "                    \
+    "srvrecord "                   \
+    "srv_use_ssl "                 \
+    "srv_check_port "              \
+    "srv_check_addr "              \
+    "srv_agent_addr "              \
+    "srv_agent_port"
+
+#define SRV_STATE_FILE_MAX_FIELDS 25
+#define SRV_STATE_FILE_MIN_FIELDS_VERSION_1 20
+#define SRV_STATE_FILE_MAX_FIELDS_VERSION_1 25
+#define SRV_STATE_LINE_MAXLEN 2000
+
+/* server flags -- 32 bits */
+#define SRV_F_BACKUP         0x0001  /* this server is a backup server */
+#define SRV_F_MAPPORTS       0x0002  /* this server uses mapped ports */
+#define SRV_F_NON_STICK      0x0004  /* never add connections allocated to this server to a stick table */
+#define SRV_F_USE_NS_FROM_PP 0x0008  /* use namespace associated with connection if present */
+#define SRV_F_FORCED_ID      0x0010  /* server's ID was forced in the configuration */
+#define SRV_F_RHTTP          0x0020  /* reverse HTTP server which requires idle connection for transfers */
+#define SRV_F_AGENTPORT      0x0040  /* this server has an agent port configured */
+#define SRV_F_AGENTADDR      0x0080  /* this server has an agent addr configured */
+#define SRV_F_COOKIESET      0x0100  /* this server has a cookie configured, so don't generate dynamic cookies */
+#define SRV_F_FASTOPEN       0x0200  /* Use TCP Fast Open to connect to server */
+#define SRV_F_SOCKS4_PROXY   0x0400  /* this server uses SOCKS4 proxy */
+#define SRV_F_NO_RESOLUTION  0x0800  /* disable runtime DNS resolution on this server */
+#define SRV_F_DYNAMIC        0x1000  /* dynamic server instantiated at runtime */
+#define SRV_F_NON_PURGEABLE  0x2000  /* this server cannot be removed at runtime */
+#define SRV_F_DEFSRV_USE_SSL 0x4000  /* default-server uses SSL */
+#define SRV_F_DELETED        0x8000  /* srv is deleted but not yet purged */
+
+/* configured server options for send-proxy (server->pp_opts) */
+#define SRV_PP_V1             0x0001  /* proxy protocol version 1 */
+#define SRV_PP_V2             0x0002  /* proxy protocol version 2 */
+#define SRV_PP_V2_SSL         0x0004  /* proxy protocol version 2 with SSL */
+#define SRV_PP_V2_SSL_CN      0x0008  /* proxy protocol version 2 with CN */
+#define SRV_PP_V2_SSL_KEY_ALG 0x0010  /* proxy protocol version 2 with cert key algorithm */
+#define SRV_PP_V2_SSL_SIG_ALG 0x0020  /* proxy protocol version 2 with cert signature algorithm */
+#define SRV_PP_V2_SSL_CIPHER  0x0040  /* proxy protocol version 2 with cipher used */
+#define SRV_PP_V2_AUTHORITY   0x0080  /* proxy protocol version 2 with authority */
+#define SRV_PP_V2_CRC32C      0x0100  /* proxy protocol version 2 with crc32c */
+#define SRV_PP_V2_UNIQUE_ID   0x0200  /* proxy protocol version 2 with unique ID */
+
+/* functions which act on servers need to return various errors */
+#define SRV_STATUS_OK       0   /* everything is OK.
*/ +#define SRV_STATUS_INTERNAL 1 /* other unrecoverable errors. */ +#define SRV_STATUS_NOSRV 2 /* no server is available */ +#define SRV_STATUS_FULL 3 /* the/all server(s) are saturated */ +#define SRV_STATUS_QUEUED 4 /* the/all server(s) are saturated but the connection was queued */ + +/* various constants */ +#define SRV_UWGHT_RANGE 256 +#define SRV_UWGHT_MAX (SRV_UWGHT_RANGE) +#define SRV_EWGHT_RANGE (SRV_UWGHT_RANGE * BE_WEIGHT_SCALE) +#define SRV_EWGHT_MAX (SRV_UWGHT_MAX * BE_WEIGHT_SCALE) + +/* server ssl options */ +#define SRV_SSL_O_NONE 0x0000 +#define SRV_SSL_O_NO_TLS_TICKETS 0x0100 /* disable session resumption tickets */ +#define SRV_SSL_O_NO_REUSE 0x200 /* disable session reuse */ +#define SRV_SSL_O_EARLY_DATA 0x400 /* Allow using early data */ + +/* log servers ring's protocols options */ +enum srv_log_proto { + SRV_LOG_PROTO_LEGACY, // messages on TCP separated by LF + SRV_LOG_PROTO_OCTET_COUNTING, // TCP frames: MSGLEN SP MSG +}; + +/* srv administrative change causes */ +enum srv_adm_st_chg_cause { + SRV_ADM_STCHGC_NONE = 0, + SRV_ADM_STCHGC_DNS_NOENT, /* entry removed from srv record */ + SRV_ADM_STCHGC_DNS_NOIP, /* no server ip in the srv record */ + SRV_ADM_STCHGC_DNS_NX, /* resolution spent too much time in NX state */ + SRV_ADM_STCHGC_DNS_TIMEOUT, /* resolution timeout */ + SRV_ADM_STCHGC_DNS_REFUSED, /* query refused by dns server */ + SRV_ADM_STCHGC_DNS_UNSPEC, /* unspecified dns error */ + SRV_ADM_STCHGC_STATS_DISABLE, /* legacy disable from the stats */ + SRV_ADM_STCHGC_STATS_STOP /* legacy stop from the stats */ +}; + +/* srv operational change causes */ +enum srv_op_st_chg_cause { + SRV_OP_STCHGC_NONE = 0, + SRV_OP_STCHGC_HEALTH, /* changed from a health check */ + SRV_OP_STCHGC_AGENT, /* changed from an agent check */ + SRV_OP_STCHGC_CLI, /* changed from the cli */ + SRV_OP_STCHGC_LUA, /* changed from lua */ + SRV_OP_STCHGC_STATS_WEB, /* changed from the web interface */ + SRV_OP_STCHGC_STATEFILE /* changed from state file */ +}; + +struct pid_list { + struct list list; + pid_t pid; + struct task *t; + int status; + int exited; +}; + +/* A tree occurrence is a descriptor of a place in a tree, with a pointer back + * to the server itself. + */ +struct server; +struct tree_occ { + struct server *server; + struct eb32_node node; +}; + +/* Each server will have one occurrence of this structure per thread */ +struct srv_per_thread { + struct mt_list streams; /* streams using this server (used by "shutdown server sessions") */ + struct eb_root idle_conns; /* Shareable idle connections */ + struct eb_root safe_conns; /* Safe idle connections */ + struct eb_root avail_conns; /* Connections in use, but with still new streams available */ + + /* Secondary idle conn storage used in parallel to idle/safe trees. + * Used to sort them by last usage and purge them in reverse order. + */ + struct list idle_conn_list; +}; + +/* Each server will have one occurrence of this structure per thread group */ +struct srv_per_tgroup { + unsigned int next_takeover; /* thread ID to try to steal connections from next time */ +}; + +/* Configure the protocol selection for websocket */ +enum __attribute__((__packed__)) srv_ws_mode { + SRV_WS_AUTO = 0, + SRV_WS_H1, + SRV_WS_H2, +}; + +/* Server-side TLV list, contains the types of the TLVs that should be sent out. + * Additionally, it can contain a format string, if specified in the config. 
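+ *
+ * As a configuration-level illustration (syntax from recent versions), a
+ * server line such as :
+ *
+ *    server srv1 192.0.2.1:443 send-proxy-v2 set-proxy-v2-tlv-fmt(0xE1) %[fc_pp_tlv(0xE1)]
+ *
+ * would result in one entry of type 0xE1 carrying the given format string.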
+ */
+struct srv_pp_tlv_list {
+	struct list list;
+	struct list fmt;
+	char *fmt_string;
+	unsigned char type;
+};
+
+struct proxy;
+struct server {
+	/* mostly config or admin stuff, doesn't change often */
+	enum obj_type obj_type;                 /* object type == OBJ_TYPE_SERVER */
+	enum srv_state next_state, cur_state;   /* server state among SRV_ST_* */
+	enum srv_admin next_admin, cur_admin;   /* server maintenance status : SRV_ADMF_* */
+	signed char use_ssl;                    /* ssl enabled (1: on, 0: disabled, -1 forced off) */
+	unsigned int flags;                     /* server flags (SRV_F_*) */
+	unsigned int pp_opts;                   /* proxy protocol options (SRV_PP_*) */
+	struct list global_list;                /* attach point in the global servers_list */
+	struct server *next;
+	struct mt_list prev_deleted;            /* deleted servers with 'next' ptr pointing to us */
+	int cklen;                              /* the len of the cookie, to speed up checks */
+	int rdr_len;                            /* the length of the redirection prefix */
+	char *cookie;                           /* the id set in the cookie */
+	char *rdr_pfx;                          /* the redirection prefix */
+
+	struct proxy *proxy;                    /* the proxy this server belongs to */
+	const struct mux_proto_list *mux_proto; /* the mux to use for all outgoing connections (specified by the "proto" keyword) */
+	struct net_addr_type addr_type;         /* server address type (socket and transport hints) */
+	struct log_target *log_target;          /* when 'mode log' is enabled, target facility used to transport log messages */
+	unsigned maxconn, minconn;              /* max # of active sessions (0 = unlimited), min # for dynamic limit. */
+	struct srv_per_thread *per_thr;         /* array of per-thread stuff such as connections lists */
+	struct srv_per_tgroup *per_tgrp;        /* array of per-tgroup stuff such as idle conns */
+	unsigned int *curr_idle_thr;            /* Current number of orphan idling connections per thread */
+
+	unsigned int pool_purge_delay;          /* Delay before starting to purge the idle conns pool */
+	unsigned int low_idle_conns;            /* min idle connection count to start picking from other threads */
+	unsigned int max_idle_conns;            /* Max number of connection allowed in the orphan connections list */
+	int max_reuse;                          /* Max number of requests on a same connection */
+	struct task *warmup;                    /* the task dedicated to the warmup when slowstart is set */
+
+	struct server *track;                   /* the server we're currently tracking, if any */
+	struct server *trackers;                /* the list of servers tracking us, if any */
+	struct server *tracknext;               /* next server tracking <track> in <track>'s trackers list */
+	char *trackit;                          /* temporary variable to make assignment deferrable */
+	int consecutive_errors_limit;           /* number of consecutive errors that triggers an event */
+	short observe, onerror;                 /* observing mode: one of HANA_OBS_*; what to do on error: one of HANA_ONERR_* */
+	short onmarkeddown;                     /* what to do when marked down: one of HANA_ONMARKEDDOWN_* */
+	short onmarkedup;                       /* what to do when marked up: one of HANA_ONMARKEDUP_* */
+	int slowstart;                          /* slowstart time in seconds (ms in the conf) */
+
+	char *id;                               /* just for identification */
+	uint32_t rid;                           /* revision: if id has been reused for a new server, rid won't match */
+	unsigned iweight, uweight, cur_eweight; /* initial weight, user-specified weight, and effective weight */
+	unsigned wscore;                        /* weight score, used during srv map computation */
+	unsigned next_eweight;                  /* next pending eweight to commit */
+	unsigned rweight;                       /* remainder of weight in the current LB tree */
+	unsigned cumulative_weight;             /* weight of servers prior to this one in the same group, for chash balancing */
+	int maxqueue;                           /* maximum number of pending
connections allowed */
+	int shard;                              /* shard (in peers protocol context only) */
+	int log_bufsize;                        /* implicit ring bufsize (for log server only - in log backend) */
+
+	enum srv_ws_mode ws;                    /* configure the protocol selection for websocket */
+	/* 3 bytes hole here */
+
+	uint refcount;                          /* refcount used to remove a server at runtime */
+
+	/* The elements below may be changed on every single request by any
+	 * thread, and generally at the same time.
+	 */
+	THREAD_PAD(63);
+	struct eb32_node idle_node;             /* When to next do cleanup in the idle connections */
+	unsigned int curr_idle_conns;           /* Current number of orphan idling connections, both the idle and the safe lists */
+	unsigned int curr_idle_nb;              /* Current number of connections in the idle list */
+	unsigned int curr_safe_nb;              /* Current number of connections in the safe list */
+	unsigned int curr_used_conns;           /* Current number of used connections */
+	unsigned int max_used_conns;            /* Max number of used connections (the counter is reset at each connection purge) */
+	unsigned int est_need_conns;            /* Estimate on the number of needed connections (max of curr and previous max_used) */
+
+	struct queue queue;                     /* pending connections */
+
+	/* Elements below are used by LB algorithms and must be usable in
+	 * parallel with other threads reusing connections above.
+	 */
+	THREAD_PAD(63);
+	__decl_thread(HA_SPINLOCK_T lock);      /* may enclose the proxy's lock, must not be taken under */
+	unsigned npos, lpos;                    /* next and last positions in the LB tree, protected by LB lock */
+	union {
+		struct eb32_node lb_node;       /* node used for tree-based load balancing */
+		struct list lb_list;            /* elem used for list-based load balancing */
+	};
+	struct server *next_full;               /* next server in the temporary full list */
+
+	/* usually atomically updated by any thread during parsing or on end of request */
+	THREAD_PAD(63);
+	int cur_sess;                           /* number of currently active sessions (including syn_sent) */
+	int served;                             /* # of active sessions currently being served (ie not pending) */
+	int consecutive_errors;                 /* current number of consecutive errors */
+	struct freq_ctr sess_per_sec;           /* sessions per second on this server */
+	struct be_counters counters;            /* statistics counters */
+
+	/* Below are some relatively stable settings, only changed under the lock */
+	THREAD_PAD(63);
+
+	struct eb_root *lb_tree;                /* we want to know in what tree the server is */
+	struct tree_occ *lb_nodes;              /* lb_nodes_tot * struct tree_occ */
+	unsigned lb_nodes_tot;                  /* number of allocated lb_nodes (C-HASH) */
+	unsigned lb_nodes_now;                  /* number of lb_nodes placed in the tree (C-HASH) */
+
+	const struct netns_entry *netns;        /* contains network namespace name or NULL.
Network namespace comes from configuration */
+	struct xprt_ops *xprt;                  /* transport-layer operations */
+	unsigned int svc_port;                  /* the port to connect to (for relevant families) */
+	unsigned down_time;                     /* total time the server was down */
+	time_t last_change;                     /* last time the state was changed */
+
+	int puid;                               /* proxy-unique server ID, used for SNMP, and "first" LB algo */
+	int tcp_ut;                             /* for TCP, user timeout */
+
+	int do_check;                           /* temporary variable used during parsing to denote if health checks must be enabled */
+	int do_agent;                           /* temporary variable used during parsing to denote if an auxiliary agent check must be enabled */
+	struct check check;                     /* health-check specific configuration */
+	struct check agent;                     /* agent specific configuration */
+
+	struct resolv_requester *resolv_requester; /* used to link a server to its DNS resolution */
+	char *resolvers_id;                     /* resolvers section used by this server */
+	struct resolvers *resolvers;            /* pointer to the resolvers structure used by this server */
+	char *lastaddr;                         /* the address string provided by the server-state file */
+	struct resolv_options resolv_opts;
+	int hostname_dn_len;                    /* string length of the server hostname in Domain Name format */
+	char *hostname_dn;                      /* server hostname in Domain Name format */
+	char *hostname;                         /* server hostname */
+	struct sockaddr_storage init_addr;      /* plain IP address specified on the init-addr line */
+	unsigned int init_addr_methods;         /* initial address setting, 3 bits per method, ends at 0, enough to store 10 entries */
+	enum srv_log_proto log_proto;           /* used proto to emit messages on server lines from log or ring section */
+
+	char *sni_expr;                         /* Temporary variable to store a sample expression for SNI */
+	struct {
+		void *ctx;
+		struct {
+			/* ptr/size may be shared R/O with other threads under read lock
+			 * "sess_lock", however only the owning thread may change them
+			 * (under write lock).
+			 */
+			unsigned char *ptr;
+			int size;
+			int allocated_size;
+			char *sni; /* SNI used for the session */
+			__decl_thread(HA_RWLOCK_T sess_lock);
+		} * reused_sess;
+		uint last_ssl_sess_tid;	/* last tid+1 having updated reused_sess (0=none, >0=tid+1) */
+
+		struct ckch_inst *inst;	/* Instance of the ckch_store in which the certificate was loaded (might be null if server has no certificate) */
+		__decl_thread(HA_RWLOCK_T lock); /* lock the cache and SSL_CTX during commit operations */
+
+		char *ciphers;		/* cipher suite to use if non-null */
+		char *ciphersuites;	/* TLS 1.3 cipher suite to use if non-null */
+		char *curves;		/* TLS curves list */
+		int options;		/* ssl options */
+		int verify;		/* verify method (set of SSL_VERIFY_* flags) */
+		struct tls_version_filter methods; /* ssl methods */
+		char *verify_host;	/* hostname of certificate must match this host */
+		char *ca_file;		/* CAfile to use on verify */
+		char *crl_file;		/* CRLfile to use on verify */
+		char *client_crt;	/* client certificate to send */
+		char *sigalgs;		/* Signature algorithms */
+		char *client_sigalgs;	/* Client Signature algorithms */
+		struct sample_expr *sni; /* sample expression for SNI */
+		char *npn_str;		/* NPN protocol string */
+		int npn_len;		/* NPN protocol string length */
+		char *alpn_str;		/* ALPN protocol string */
+		int alpn_len;		/* ALPN protocol string length */
+	} ssl_ctx;
+	struct resolv_srvrq *srvrq;	/* Pointer representing the DNS SRV request, if any */
+	struct list srv_rec_item;	/* to attach server to a srv record item */
+	struct list ip_rec_item;	/* to attach server to an A or AAAA record item */
+	struct ebpt_node host_dn;	/* hostdn store for srvrq and state file matching */
+	struct list pp_tlvs;		/* to send out PROXY protocol v2 TLVs */
+	struct task *srvrq_check;	/* Task testing SRV record expiration date for this server */
+	struct {
+		const char *file;	/* file where the section appears */
+		struct eb32_node id;	/* place in the tree of used IDs */
+		struct ebpt_node name;	/* place in the tree of used names */
+		int line;		/* line where the section appears */
+	} conf;				/* config information */
+	struct ebpt_node addr_node;	/* Node for string representation of address for the server (including port number) */
+	/* Template information used only for server objects which
+	 * serve as template filled at parsing time and used during
+	 * server allocations from server templates.
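+	 * For example, "server-template web 1-5 ..." would yield prefix "web",
+	 * nb_low 1 and nb_high 5, producing servers web1..web5 (illustration
+	 * based on the documented server-template syntax).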
+	 */
+	struct {
+		char *prefix;
+		int nb_low;
+		int nb_high;
+	} tmpl_info;
+
+	event_hdl_sub_list e_subs;	/* event_hdl: server's subscribers list (atomically updated) */
+
+	/* warning, these structs are huge, keep them at the bottom */
+	struct conn_src conn_src;	/* connection source settings */
+	struct sockaddr_storage addr;	/* the address to connect to, doesn't include the port */
+	struct sockaddr_storage socks4_addr; /* the address of the SOCKS4 Proxy, including the port */
+
+	EXTRA_COUNTERS(extra_counters);
+};
+
+/* data provided to EVENT_HDL_SUB_SERVER handlers through event_hdl facility */
+struct event_hdl_cb_data_server {
+	/* provided by:
+	 * EVENT_HDL_SUB_SERVER_ADD
+	 * EVENT_HDL_SUB_SERVER_DEL
+	 * EVENT_HDL_SUB_SERVER_UP
+	 * EVENT_HDL_SUB_SERVER_DOWN
+	 * EVENT_HDL_SUB_SERVER_STATE
+	 * EVENT_HDL_SUB_SERVER_ADMIN
+	 * EVENT_HDL_SUB_SERVER_CHECK
+	 * EVENT_HDL_SUB_SERVER_INETADDR
+	 */
+	struct {
+		/* safe data can be safely used from both
+		 * sync and async handlers
+		 * data consistency is guaranteed
+		 */
+		char name[64];		/* server name/id */
+		char proxy_name[64];	/* id of proxy the server belongs to */
+		int proxy_uuid;		/* uuid of the proxy the server belongs to */
+		int puid;		/* proxy-unique server ID */
+		uint32_t rid;		/* server id revision */
+		unsigned int flags;	/* server flags */
+	} safe;
+	struct {
+		/* unsafe data may only be used from sync handlers:
+		 * in async mode, data consistency cannot be guaranteed
+		 * and unsafe data may already be stale, thus using
+		 * it is highly discouraged because it
+		 * could lead to undefined behavior (UAF, null dereference...)
+		 */
+		struct server *ptr;	/* server live ptr */
+		/* lock hints */
+		uint8_t thread_isolate;	/* 1 = thread_isolate is on, no locking required */
+		uint8_t srv_lock;	/* 1 = srv lock is held */
+	} unsafe;
+};
+
+/* check result snapshot provided through some event_hdl server events */
+struct event_hdl_cb_data_server_checkres {
+	uint8_t agent;			/* 1 = agent check, 0 = health check */
+	enum chk_result result;		/* failed, passed, condpass (CHK_RES_*) */
+	long duration;			/* total check duration in ms */
+	struct {
+		short status;		/* check status as in check->status */
+		short code;		/* provided with some check statuses */
+	} reason;
+	struct {
+		int cur;		/* dynamic (= check->health) */
+		int rise, fall;		/* config dependent */
+	} health;			/* check's health, see check-t.h */
+};
+
+/* data provided to EVENT_HDL_SUB_SERVER_STATE handlers through
+ * event_hdl facility
+ *
+ * Note that this may be cast to regular event_hdl_cb_data_server if
+ * you don't care about state related optional info
+ */
+struct event_hdl_cb_data_server_state {
+	/* provided by:
+	 * EVENT_HDL_SUB_SERVER_STATE
+	 */
+	struct event_hdl_cb_data_server server;	/* must be at the beginning */
+	struct {
+		uint8_t type;	/* 0 = operational, 1 = administrative */
+		enum srv_state old_state, new_state; /* updated by both operational and admin changes */
+		uint32_t requeued; /* requeued connections due to server state change */
+		union {
+			/* state change cause:
+			 *
+			 * look for op_st_chg for operational state change,
+			 * and adm_st_chg for administrative state change
+			 */
+			struct {
+				enum srv_op_st_chg_cause cause;
+				union {
+					/* check result is provided with
+					 * cause == SRV_OP_STCHGC_HEALTH or cause == SRV_OP_STCHGC_AGENT
+					 */
+					struct event_hdl_cb_data_server_checkres check;
+				};
+			} op_st_chg;
+			struct {
+				enum srv_adm_st_chg_cause cause;
+			} adm_st_chg;
+		};
+	} safe;
+	/* no unsafe data */
+};
+
+/* data provided to EVENT_HDL_SUB_SERVER_ADMIN handlers through
+ * event_hdl facility
+ *
+ * Note that this may be cast to regular event_hdl_cb_data_server if
+ * you don't care about admin related optional info
+ */
+struct event_hdl_cb_data_server_admin {
+	/* provided by:
+	 * EVENT_HDL_SUB_SERVER_ADMIN
+	 */
+	struct event_hdl_cb_data_server server;	/* must be at the beginning */
+	struct {
+		enum srv_admin old_admin, new_admin;
+		uint32_t requeued; /* requeued connections due to server admin change */
+		/* admin change cause */
+		enum srv_adm_st_chg_cause cause;
+	} safe;
+	/* no unsafe data */
+};
+
+/* data provided to EVENT_HDL_SUB_SERVER_CHECK handlers through
+ * event_hdl facility
+ *
+ * Note that this may be cast to regular event_hdl_cb_data_server if
+ * you don't care about check related optional info
+ */
+struct event_hdl_cb_data_server_check {
+	/* provided by:
+	 * EVENT_HDL_SUB_SERVER_CHECK
+	 */
+	struct event_hdl_cb_data_server server;	/* must be at the beginning */
+	struct {
+		struct event_hdl_cb_data_server_checkres res; /* check result snapshot */
+	} safe;
+	struct {
+		struct check *ptr; /* check ptr */
+	} unsafe;
+};
+
+/* struct to store server address and port information in INET
+ * context
+ */
+struct server_inetaddr {
+	int family; /* AF_UNSPEC, AF_INET or AF_INET6 */
+	union {
+		struct in_addr v4;
+		struct in6_addr v6;
+	} addr; /* may hold v4 or v6 addr */
+	struct {
+		unsigned int svc;
+		uint8_t map; /* is a mapped port? (boolean) */
+	} port;
+};
+
+/* data provided to EVENT_HDL_SUB_SERVER_INETADDR handlers through
+ * event_hdl facility
+ *
+ * Note that this may be cast to regular event_hdl_cb_data_server if
+ * you don't care about inetaddr related optional info
+ */
+struct event_hdl_cb_data_server_inetaddr {
+	/* provided by:
+	 * EVENT_HDL_SUB_SERVER_INETADDR
+	 */
+	struct event_hdl_cb_data_server server;	/* must be at the beginning */
+	struct {
+		struct server_inetaddr prev;
+		struct server_inetaddr next;
+		uint8_t purge_conn; /* set to 1 if the network change will force a connection cleanup */
+	} safe;
+	/* no unsafe data */
+};
+
+/* Storage structure to load server-state lines from a flat file into
+ * an ebtree, for faster processing
+ */
+struct server_state_line {
+	char *line;
+	char *params[SRV_STATE_FILE_MAX_FIELDS];
+	struct eb64_node node;
+};
+
+
+/* Descriptor for a "server" keyword. The ->parse() function returns 0 in case of
+ * success, or a combination of ERR_* flags if an error is encountered. The
+ * function pointer can be NULL if not implemented. The function also has an
+ * access to the current "server" config line. The ->skip value tells the parser
+ * how many words have to be skipped after the keyword. If the function needs to
+ * parse more keywords, it needs to update cur_arg.
+ */
+struct srv_kw {
+	const char *kw;
+	int (*parse)(char **args, int *cur_arg, struct proxy *px, struct server *srv, char **err);
+	int skip; /* min number of args to skip when kw is not handled */
+	int default_ok; /* non-zero if kw is supported in default-server section */
+	int dynamic_ok; /* non-zero if kw is supported in add server cli command */
+};
+
+/*
+ * A keyword list. It is a NULL-terminated array of keywords. It embeds a
+ * struct list in order to be linked to other lists, allowing it to easily
+ * be declared where it is needed, and linked without duplicating data nor
+ * allocating memory. It is also possible to indicate a scope for the keywords.
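+ *
+ * As an illustration (hypothetical names, a sketch rather than a reference),
+ * a minimal list registering one "server" keyword could look like this, using
+ * srv_register_keywords() declared in haproxy/server.h:
+ *
+ *   static int parse_my_opt(char **args, int *cur_arg, struct proxy *px,
+ *                           struct server *srv, char **err)
+ *   {
+ *           return 0; // 0 = success, combination of ERR_* otherwise
+ *   }
+ *
+ *   static struct srv_kw_list my_kws = { "MYSCOPE", { }, {
+ *           { "my-opt", parse_my_opt, 1, 1, 0 }, // skip=1, default-server ok
+ *           { NULL, NULL, 0 },
+ *   }};
+ *
+ *   INITCALL1(STG_REGISTER, srv_register_keywords, &my_kws);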
+ */ +struct srv_kw_list { + const char *scope; + struct list list; + struct srv_kw kw[VAR_ARRAY]; +}; + +#define SRV_PARSE_DEFAULT_SERVER 0x01 /* 'default-server' keyword */ +#define SRV_PARSE_TEMPLATE 0x02 /* 'server-template' keyword */ +#define SRV_PARSE_IN_PEER_SECTION 0x04 /* keyword in a peer section */ +#define SRV_PARSE_PARSE_ADDR 0x08 /* required to parse the server address in the second argument */ +#define SRV_PARSE_DYNAMIC 0x10 /* dynamic server created at runtime with cli */ +#define SRV_PARSE_INITIAL_RESOLVE 0x20 /* resolve immediately the fqdn to an ip address */ + +#endif /* _HAPROXY_SERVER_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/server.h b/include/haproxy/server.h new file mode 100644 index 0000000..2ba6e45 --- /dev/null +++ b/include/haproxy/server.h @@ -0,0 +1,328 @@ +/* + * include/haproxy/server.h + * This file defines everything related to servers. + * + * Copyright (C) 2000-2009 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SERVER_H +#define _HAPROXY_SERVER_H + +#include <unistd.h> + +#include <haproxy/api.h> +#include <haproxy/applet-t.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/proxy-t.h> +#include <haproxy/resolvers-t.h> +#include <haproxy/server-t.h> +#include <haproxy/task.h> +#include <haproxy/thread-t.h> +#include <haproxy/time.h> +#include <haproxy/tools.h> + + +__decl_thread(extern HA_SPINLOCK_T idle_conn_srv_lock); +extern struct idle_conns idle_conns[MAX_THREADS]; +extern struct task *idle_conn_task; +extern struct list servers_list; +extern struct dict server_key_dict; + +int srv_downtime(const struct server *s); +int srv_lastsession(const struct server *s); +int srv_getinter(const struct check *check); +void srv_settings_cpy(struct server *srv, const struct server *src, int srv_tmpl); +int parse_server(const char *file, int linenum, char **args, struct proxy *curproxy, const struct proxy *defproxy, int parse_flags); +int srv_update_addr(struct server *s, void *ip, int ip_sin_family, const char *updater); +int server_parse_sni_expr(struct server *newsrv, struct proxy *px, char **err); +const char *srv_update_addr_port(struct server *s, const char *addr, const char *port, char *updater); +const char *srv_update_check_addr_port(struct server *s, const char *addr, const char *port); +const char *srv_update_agent_addr_port(struct server *s, const char *addr, const char *port); +struct server *server_find_by_id(struct proxy *bk, int id); +struct server *server_find_by_name(struct proxy *bk, const char *name); +struct server *server_find_best_match(struct proxy *bk, char *name, int id, int *diff); +void apply_server_state(void); +void srv_compute_all_admin_states(struct proxy *px); +int srv_set_addr_via_libc(struct server *srv, int *err_code); +int srv_init_addr(void); 
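+
+/* Illustrative (hypothetical) use of the lookup helpers above, resolving a
+ * reference that may be either a numeric ID or a name:
+ *
+ *   struct server *find_ref(struct proxy *bk, const char *ref)
+ *   {
+ *           if (isdigit((unsigned char)*ref))
+ *                   return server_find_by_id(bk, atoi(ref));
+ *           return server_find_by_name(bk, ref);
+ *   }
+ */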
+struct server *cli_find_server(struct appctx *appctx, char *arg);
+struct server *new_server(struct proxy *proxy);
+void srv_take(struct server *srv);
+struct server *srv_drop(struct server *srv);
+void srv_free_params(struct server *srv);
+int srv_init_per_thr(struct server *srv);
+void srv_set_ssl(struct server *s, int use_ssl);
+const char *srv_adm_st_chg_cause(enum srv_adm_st_chg_cause cause);
+const char *srv_op_st_chg_cause(enum srv_op_st_chg_cause cause);
+void srv_event_hdl_publish_check(struct server *srv, struct check *check);
+
+/* functions related to server name resolution */
+int srv_prepare_for_resolution(struct server *srv, const char *hostname);
+int srvrq_update_srv_status(struct server *s, int has_no_ip);
+int snr_update_srv_status(struct server *s, int has_no_ip);
+int srv_set_fqdn(struct server *srv, const char *fqdn, int resolv_locked);
+const char *srv_update_fqdn(struct server *server, const char *fqdn, const char *updater, int dns_locked);
+int snr_resolution_cb(struct resolv_requester *requester, struct dns_counters *counters);
+int srvrq_resolution_error_cb(struct resolv_requester *requester, int error_code);
+int snr_resolution_error_cb(struct resolv_requester *requester, int error_code);
+struct server *snr_check_ip_callback(struct server *srv, void *ip, unsigned char *ip_family);
+struct task *srv_cleanup_idle_conns(struct task *task, void *ctx, unsigned int state);
+void srv_release_conn(struct server *srv, struct connection *conn);
+struct connection *srv_lookup_conn(struct eb_root *tree, uint64_t hash);
+struct connection *srv_lookup_conn_next(struct connection *conn);
+
+void _srv_add_idle(struct server *srv, struct connection *conn, int is_safe);
+int srv_add_to_idle_list(struct server *srv, struct connection *conn, int is_safe);
+void srv_add_to_avail_list(struct server *srv, struct connection *conn);
+struct task *srv_cleanup_toremove_conns(struct task *task, void *context, unsigned int state);
+
+int srv_apply_track(struct server *srv, struct proxy *curproxy);
+
+/*
+ * Registers the server keyword list <kwl> as a list of valid keywords for next
+ * parsing sessions.
+ */
+void srv_register_keywords(struct srv_kw_list *kwl);
+
+/* Return a pointer to the server keyword <kw>, or NULL if not found. */
+struct srv_kw *srv_find_kw(const char *kw);
+
+/* Dumps all registered "server" keywords to the <out> string pointer. */
+void srv_dump_kws(char **out);
+
+/* Recomputes the server's eweight based on its state, uweight, the current time,
+ * and the proxy's algorithm. To be used after updating sv->uweight. The warmup
+ * state is automatically disabled if the time is elapsed.
+ */
+void server_recalc_eweight(struct server *sv, int must_update);
+
+/*
+ * Parses weight_str and configures sv accordingly.
+ * Returns NULL on success, error message string otherwise.
+ */
+const char *server_parse_weight_change_request(struct server *sv,
+					       const char *weight_str);
+
+/*
+ * Parses addr_str and configures sv accordingly. <updater> indicates the
+ * source of the change in the associated log message.
+ * Returns NULL on success, error message string otherwise.
+ */
+const char *server_parse_addr_change_request(struct server *sv,
+					     const char *addr_str, const char *updater);
+
+/*
+ * Parses maxconn_str and configures sv accordingly.
+ * Returns NULL on success, error message string otherwise.
+ */
+const char *server_parse_maxconn_change_request(struct server *sv,
+						const char *maxconn_str);
+
+/* Shutdown all connections of a server. The caller must pass a termination
+ * code in <why>, which must be one of SF_ERR_* indicating the reason for the
+ * shutdown.
+ */
+void srv_shutdown_streams(struct server *srv, int why);
+
+/* Shutdown all connections of all backup servers of a proxy. The caller must
+ * pass a termination code in <why>, which must be one of SF_ERR_* indicating
+ * the reason for the shutdown.
+ */
+void srv_shutdown_backup_streams(struct proxy *px, int why);
+
+void srv_append_status(struct buffer *msg, struct server *s, struct check *,
+		       int xferred, int forced);
+
+void srv_set_stopped(struct server *s, enum srv_op_st_chg_cause cause);
+void srv_set_running(struct server *s, enum srv_op_st_chg_cause cause);
+void srv_set_stopping(struct server *s, enum srv_op_st_chg_cause cause);
+
+/* Enables admin flag <mode> (among SRV_ADMF_*) on server <s>. This is used to
+ * enforce either maint mode or drain mode. It is not allowed to set more than
+ * one flag at once. The equivalent "inherited" flag is propagated to all
+ * tracking servers. Maintenance mode disables health checks (but not agent
+ * checks). When either the flag is already set or no flag is passed, nothing
+ * is done. If <cause> is non-null, it will be displayed at the end of the log
+ * lines to justify the state change.
+ */
+void srv_set_admin_flag(struct server *s, enum srv_admin mode, enum srv_adm_st_chg_cause cause);
+
+/* Disables admin flag <mode> (among SRV_ADMF_*) on server <s>. This is used to
+ * stop enforcing either maint mode or drain mode. It is not allowed to set more
+ * than one flag at once. The equivalent "inherited" flag is propagated to all
+ * tracking servers. Leaving maintenance mode re-enables health checks. When
+ * either the flag is already cleared or no flag is passed, nothing is done.
+ */
+void srv_clr_admin_flag(struct server *s, enum srv_admin mode);
+
+/* Calculates the dynamic persistent cookie for a server, if a secret key has
+ * been provided.
+ */
+void srv_set_dyncookie(struct server *s);
+
+int srv_check_reuse_ws(struct server *srv);
+const struct mux_ops *srv_get_ws_proto(struct server *srv);
+
+/* increase the number of cumulated connections on the designated server */
+static inline void srv_inc_sess_ctr(struct server *s)
+{
+	_HA_ATOMIC_INC(&s->counters.cum_sess);
+	HA_ATOMIC_UPDATE_MAX(&s->counters.sps_max,
+			     update_freq_ctr(&s->sess_per_sec, 1));
+}
+
+/* set the time of last session on the designated server */
+static inline void srv_set_sess_last(struct server *s)
+{
+	s->counters.last_sess = ns_to_sec(now_ns);
+}
+
+/* returns the current server throttle rate between 0 and 100% */
+static inline unsigned int server_throttle_rate(struct server *sv)
+{
+	struct proxy *px = sv->proxy;
+
+	/* when uweight is 0, we're in soft-stop, which cannot be a slowstart,
+	 * thus the throttle is 100%.
+	 */
+	if (!sv->uweight)
+		return 100;
+
+	return (100U * px->lbprm.wmult * sv->cur_eweight + px->lbprm.wdiv - 1) / (px->lbprm.wdiv * sv->uweight);
+}
+
+/*
+ * Return true if the server has a zero user-weight, meaning it's in draining
+ * mode (i.e. not taking new non-persistent connections).
+ */
+static inline int server_is_draining(const struct server *s)
+{
+	return !s->uweight || (s->cur_admin & SRV_ADMF_DRAIN);
+}
+
+/* Puts server <s> into maintenance mode, and propagates that status down to all
+ * tracking servers.
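+ * Note that forced maintenance and forced drain are exclusive, which is why
+ * the helper below raises SRV_ADMF_FMAINT and clears SRV_ADMF_FDRAIN in the
+ * same operation.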
+ */
+static inline void srv_adm_set_maint(struct server *s)
+{
+	srv_set_admin_flag(s, SRV_ADMF_FMAINT, SRV_ADM_STCHGC_NONE);
+	srv_clr_admin_flag(s, SRV_ADMF_FDRAIN);
+}
+
+/* Puts server <s> into drain mode, and propagates that status down to all
+ * tracking servers.
+ */
+static inline void srv_adm_set_drain(struct server *s)
+{
+	srv_set_admin_flag(s, SRV_ADMF_FDRAIN, SRV_ADM_STCHGC_NONE);
+	srv_clr_admin_flag(s, SRV_ADMF_FMAINT);
+}
+
+/* Puts server <s> into ready mode, and propagates that status down to all
+ * tracking servers.
+ */
+static inline void srv_adm_set_ready(struct server *s)
+{
+	srv_clr_admin_flag(s, SRV_ADMF_FDRAIN);
+	srv_clr_admin_flag(s, SRV_ADMF_FMAINT);
+}
+
+/* appends an initaddr method to the existing list. Returns 0 on failure. */
+static inline int srv_append_initaddr(unsigned int *list, enum srv_initaddr addr)
+{
+	int shift = 0;
+
+	while (shift + 3 < 32 && (*list >> shift))
+		shift += 3;
+
+	if (shift + 3 > 32)
+		return 0;
+
+	*list |= addr << shift;
+	return 1;
+}
+
+/* returns the next initaddr method and removes it from <list> by shifting
+ * it right (implying that it MUST NOT be the server's). Returns SRV_IADDR_END
+ * at the end.
+ */
+static inline enum srv_initaddr srv_get_next_initaddr(unsigned int *list)
+{
+	enum srv_initaddr ret;
+
+	ret = *list & 7;
+	*list >>= 3;
+	return ret;
+}
+
+static inline void srv_use_conn(struct server *srv, struct connection *conn)
+{
+	unsigned int curr, prev;
+
+	curr = _HA_ATOMIC_ADD_FETCH(&srv->curr_used_conns, 1);
+
+	/* It's ok not to do that atomically, we don't need an
+	 * exact max.
+	 */
+	prev = HA_ATOMIC_LOAD(&srv->max_used_conns);
+	if (prev < curr)
+		HA_ATOMIC_STORE(&srv->max_used_conns, curr);
+
+	prev = HA_ATOMIC_LOAD(&srv->est_need_conns);
+	if (prev < curr)
+		HA_ATOMIC_STORE(&srv->est_need_conns, curr);
+}
+
+/* checks if minconn and maxconn are consistent with each other
+ * and automatically adjusts them if it is not the case.
+ * This logic was historically implemented in check_config_validity()
+ * at boot time, but with the introduction of dynamic servers
+ * this may be used at multiple places in the code now
+ */
+static inline void srv_minmax_conn_apply(struct server *srv)
+{
+	if (srv->minconn > srv->maxconn) {
+		/* Only 'minconn' was specified, or it was higher than or equal
+		 * to 'maxconn'. Let's turn this into maxconn and clean it, as
+		 * this will avoid further useless expensive computations.
+		 */
+		srv->maxconn = srv->minconn;
+	} else if (srv->maxconn && !srv->minconn) {
+		/* minconn was not specified, so we set it to maxconn */
+		srv->minconn = srv->maxconn;
+	}
+}
+
+/* Returns true if the server is used in transparent mode. */
+static inline int srv_is_transparent(const struct server *srv)
+{
+	/* A reverse server does not have any address but it is not used as a
+	 * transparent one.
+	 */
+	return (!is_addr(&srv->addr) && !(srv->flags & SRV_F_RHTTP)) ||
+	       (srv->flags & SRV_F_MAPPORTS);
+}
+
+#endif /* _HAPROXY_SERVER_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/session-t.h b/include/haproxy/session-t.h
new file mode 100644
index 0000000..dff167e
--- /dev/null
+++ b/include/haproxy/session-t.h
@@ -0,0 +1,78 @@
+/*
+ * include/haproxy/session-t.h
+ * This file defines everything related to sessions.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_SESSION_T_H
+#define _HAPROXY_SESSION_T_H
+
+
+#include <sys/time.h>
+#include <unistd.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <haproxy/api-t.h>
+#include <haproxy/obj_type-t.h>
+#include <haproxy/stick_table-t.h>
+#include <haproxy/task-t.h>
+#include <haproxy/vars-t.h>
+
+
+/* session flags */
+enum {
+	SESS_FL_NONE          = 0x00000000, /* nothing */
+	SESS_FL_PREFER_LAST   = 0x00000001, /* NTLM authentication: we should reuse the last conn */
+};
+
+/* max number of idle server connections kept attached to a session */
+#define MAX_SRV_LIST	5
+
+struct session {
+	struct proxy *fe;		/* the proxy this session depends on for the client side */
+	struct listener *listener;	/* the listener by which the request arrived */
+	enum obj_type *origin;		/* the connection / applet which initiated this session */
+	struct timeval accept_date;	/* date of the session's accept() in user date */
+	ullong accept_ts;		/* date of the session's accept() in internal date (monotonic) */
+	struct stkctr *stkctr;		/* stick counters for tcp-connection */
+	struct vars vars;		/* list of variables for the session scope. */
+	struct task *task;		/* handshake timeout processing */
+	long t_handshake;		/* handshake duration, -1 = not completed */
+	long t_idle;			/* idle duration, -1 if it never occurred */
+	int idle_conns;			/* Number of connections we're currently responsible for that we are not using */
+	unsigned int flags;		/* session flags, SESS_FL_* */
+	struct list srv_list;		/* List of servers and the connections the session is currently responsible for */
+	struct sockaddr_storage *src;	/* source address (pool), when known, otherwise NULL */
+	struct sockaddr_storage *dst;	/* destination address (pool), when known, otherwise NULL */
+};
+
+struct sess_srv_list {
+	void *target;
+	struct list conn_list;		/* Head of the connections list */
+	struct list srv_list;		/* Next element of the server list */
+};
+
+#endif /* _HAPROXY_SESSION_T_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/session.h b/include/haproxy/session.h
new file mode 100644
index 0000000..38335e4
--- /dev/null
+++ b/include/haproxy/session.h
@@ -0,0 +1,335 @@
+/*
+ * include/haproxy/session.h
+ * This file contains functions used to manage sessions.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_SESSION_H
+#define _HAPROXY_SESSION_H
+
+#include <haproxy/api.h>
+#include <haproxy/connection.h>
+#include <haproxy/global-t.h>
+#include <haproxy/obj_type-t.h>
+#include <haproxy/pool.h>
+#include <haproxy/server.h>
+#include <haproxy/session-t.h>
+#include <haproxy/stick_table.h>
+
+extern struct pool_head *pool_head_session;
+extern struct pool_head *pool_head_sess_srv_list;
+
+struct session *session_new(struct proxy *fe, struct listener *li, enum obj_type *origin);
+void session_free(struct session *sess);
+int session_accept_fd(struct connection *cli_conn);
+int conn_complete_session(struct connection *conn);
+struct task *session_expire_embryonic(struct task *t, void *context, unsigned int state);
+
+/* Removes the session's refcount from the tracked counters, and clears the
+ * pointer to ensure this is only performed once. The caller is responsible for
+ * ensuring that the pointer is valid first.
+ */
+static inline void session_store_counters(struct session *sess)
+{
+	void *ptr;
+	int i;
+	struct stksess *ts;
+
+	if (unlikely(!sess->stkctr)) // pool not allocated yet
+		return;
+
+	for (i = 0; i < global.tune.nb_stk_ctr; i++) {
+		struct stkctr *stkctr = &sess->stkctr[i];
+
+		ts = stkctr_entry(stkctr);
+		if (!ts)
+			continue;
+
+		ptr = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_CONN_CUR);
+		if (ptr) {
+			HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock);
+
+			if (stktable_data_cast(ptr, std_t_uint) > 0)
+				stktable_data_cast(ptr, std_t_uint)--;
+
+			HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock);
+
+			/* If data was modified, we need to touch to re-schedule sync */
+			stktable_touch_local(stkctr->table, ts, 0);
+		}
+
+		stkctr_set_entry(stkctr, NULL);
+		stksess_kill_if_expired(stkctr->table, ts, 1);
+	}
+}
+
+/* Increase the number of cumulated HTTP requests in the tracked counters */
+static inline void session_inc_http_req_ctr(struct session *sess)
+{
+	int i;
+
+	if (unlikely(!sess->stkctr)) // pool not allocated yet
+		return;
+
+	for (i = 0; i < global.tune.nb_stk_ctr; i++)
+		stkctr_inc_http_req_ctr(&sess->stkctr[i]);
+}
+
+/* Increase the number of cumulated failed HTTP requests in the tracked
+ * counters. Only 4xx requests should be counted here so that we can
+ * distinguish between errors caused by client behaviour and other ones.
+ * Note that even 404s are interesting because they're generally caused by
+ * vulnerability scans.
+ */
+static inline void session_inc_http_err_ctr(struct session *sess)
+{
+	int i;
+
+	if (unlikely(!sess->stkctr)) // pool not allocated yet
+		return;
+
+	for (i = 0; i < global.tune.nb_stk_ctr; i++)
+		stkctr_inc_http_err_ctr(&sess->stkctr[i]);
+}
+
+/* Increase the number of cumulated failed HTTP responses in the tracked
+ * counters. Only some 5xx responses should be counted here so that we can
+ * distinguish between server failures and errors triggered by the client
+ * (i.e. 501 and 505 may be triggered and must be ignored).
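+ * The remainder of the 5xx range (typically 500, 502, 503 and 504) is what
+ * gets counted as a failure here.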
+ */
+static inline void session_inc_http_fail_ctr(struct session *sess)
+{
+	int i;
+
+	if (unlikely(!sess->stkctr)) // pool not allocated yet
+		return;
+
+	for (i = 0; i < global.tune.nb_stk_ctr; i++)
+		stkctr_inc_http_fail_ctr(&sess->stkctr[i]);
+}
+
+
+/* Remove the connection from the session list, and destroy the srv_list if it's now empty */
+static inline void session_unown_conn(struct session *sess, struct connection *conn)
+{
+	struct sess_srv_list *srv_list = NULL;
+
+	BUG_ON(objt_listener(conn->target));
+
+	/* WT: this currently is a workaround for an inconsistency between
+	 * the link status of the connection in the session list and the
+	 * connection's owner. This should be removed as soon as all this
+	 * is addressed. Right now it's possible to enter here with a non-null
+	 * conn->owner that points to a dead session, but in this case the
+	 * element is not linked.
+	 */
+	if (!LIST_INLIST(&conn->session_list))
+		return;
+
+	if (conn->flags & CO_FL_SESS_IDLE)
+		sess->idle_conns--;
+	LIST_DEL_INIT(&conn->session_list);
+	conn->owner = NULL;
+	list_for_each_entry(srv_list, &sess->srv_list, srv_list) {
+		if (srv_list->target == conn->target) {
+			if (LIST_ISEMPTY(&srv_list->conn_list)) {
+				LIST_DELETE(&srv_list->srv_list);
+				pool_free(pool_head_sess_srv_list, srv_list);
+			}
+			break;
+		}
+	}
+}
+
+/* Add the connection <conn> to the server list of the session <sess>. This
+ * function is called only if the connection is private. Nothing is performed if
+ * the connection is already in the session server list or if the session does
+ * not own the connection.
+ */
+static inline int session_add_conn(struct session *sess, struct connection *conn, void *target)
+{
+	struct sess_srv_list *srv_list = NULL;
+	int found = 0;
+
+	BUG_ON(objt_listener(conn->target));
+
+	/* Already attached to the session, or not the connection owner */
+	if (!LIST_ISEMPTY(&conn->session_list) || (conn->owner && conn->owner != sess))
+		return 1;
+
+	list_for_each_entry(srv_list, &sess->srv_list, srv_list) {
+		if (srv_list->target == target) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found) {
+		/* The session has no connection for the server, create a new entry */
+		srv_list = pool_alloc(pool_head_sess_srv_list);
+		if (!srv_list)
+			return 0;
+		srv_list->target = target;
+		LIST_INIT(&srv_list->conn_list);
+		LIST_APPEND(&sess->srv_list, &srv_list->srv_list);
+	}
+	LIST_APPEND(&srv_list->conn_list, &conn->session_list);
+	return 1;
+}
+
+/* Returns 0 if the session can keep the idle conn, -1 if it was destroyed. The
+ * connection must be private.
+ */
+static inline int session_check_idle_conn(struct session *sess, struct connection *conn)
+{
+	/* Another session owns this connection */
+	if (conn->owner != sess)
+		return 0;
+
+	if (sess->idle_conns >= sess->fe->max_out_conns) {
+		session_unown_conn(sess, conn);
+		conn->owner = NULL;
+		conn->flags &= ~CO_FL_SESS_IDLE;
+		conn->mux->destroy(conn->ctx);
+		return -1;
+	} else {
+		conn->flags |= CO_FL_SESS_IDLE;
+		sess->idle_conns++;
+	}
+	return 0;
+}
+
+/* Look for an available connection matching the target <target> in the server
+ * list of the session <sess>. It returns a connection if found. Otherwise it
+ * returns NULL.
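+ *
+ * Illustrative (non-normative) reuse sequence from an upper layer, where
+ * <srv> and the connection allocation are hypothetical:
+ *
+ *   conn = session_get_conn(sess, srv, hash);
+ *   if (!conn) {
+ *           conn = ...allocate a new private connection...;
+ *           session_add_conn(sess, conn, srv);
+ *   }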
+ */
+static inline struct connection *session_get_conn(struct session *sess, void *target, int64_t hash)
+{
+	struct connection *srv_conn = NULL;
+	struct sess_srv_list *srv_list;
+
+	list_for_each_entry(srv_list, &sess->srv_list, srv_list) {
+		if (srv_list->target == target) {
+			list_for_each_entry(srv_conn, &srv_list->conn_list, session_list) {
+				if ((srv_conn->hash_node && srv_conn->hash_node->node.key == hash) &&
+				    srv_conn->mux &&
+				    (srv_conn->mux->avail_streams(srv_conn) > 0) &&
+				    !(srv_conn->flags & CO_FL_WAIT_XPRT)) {
+					if (srv_conn->flags & CO_FL_SESS_IDLE) {
+						srv_conn->flags &= ~CO_FL_SESS_IDLE;
+						sess->idle_conns--;
+					}
+					goto end;
+				}
+			}
+			srv_conn = NULL; /* No available connection found */
+			goto end;
+		}
+	}
+
+ end:
+	return srv_conn;
+}
+
+/* Returns the source address of the session and falls back to the client
+ * connection if not set. It returns a const address on success or NULL on
+ * failure.
+ */
+static inline const struct sockaddr_storage *sess_src(struct session *sess)
+{
+	struct connection *cli_conn = objt_conn(sess->origin);
+
+	if (sess->src)
+		return sess->src;
+	if (cli_conn && conn_get_src(cli_conn))
+		return conn_src(cli_conn);
+	return NULL;
+}
+
+/* Returns the destination address of the session and falls back to the client
+ * connection if not set. It returns a const address on success or NULL on
+ * failure.
+ */
+static inline const struct sockaddr_storage *sess_dst(struct session *sess)
+{
+	struct connection *cli_conn = objt_conn(sess->origin);
+
+	if (sess->dst)
+		return sess->dst;
+	if (cli_conn && conn_get_dst(cli_conn))
+		return conn_dst(cli_conn);
+	return NULL;
+}
+
+
+/* Retrieves the source address of the session <sess>. Returns non-zero on
+ * success or zero on failure. The operation is only performed once and the
+ * address is stored in the session for future use. On the first call, the
+ * session source address is copied from the client connection one.
+ */
+static inline int sess_get_src(struct session *sess)
+{
+	struct connection *cli_conn = objt_conn(sess->origin);
+	const struct sockaddr_storage *src = NULL;
+
+	if (sess->src)
+		return 1;
+
+	if (cli_conn && conn_get_src(cli_conn))
+		src = conn_src(cli_conn);
+	if (!src)
+		return 0;
+
+	if (!sockaddr_alloc(&sess->src, src, sizeof(*src)))
+		return 0;
+
+	return 1;
+}
+
+
+/* Retrieves the destination address of the session <sess>. Returns non-zero on
+ * success or zero on failure. The operation is only performed once and the
+ * address is stored in the session for future use. On the first call, the
+ * session destination address is copied from the client connection one.
+ */ +static inline int sess_get_dst(struct session *sess) +{ + struct connection *cli_conn = objt_conn(sess->origin); + const struct sockaddr_storage *dst = NULL; + + if (sess->dst) + return 1; + + if (cli_conn && conn_get_dst(cli_conn)) + dst = conn_dst(cli_conn); + if (!dst) + return 0; + + if (!sockaddr_alloc(&sess->dst, dst, sizeof(*dst))) + return 0; + + return 1; +} + +#endif /* _HAPROXY_SESSION_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/shctx-t.h b/include/haproxy/shctx-t.h new file mode 100644 index 0000000..493024a --- /dev/null +++ b/include/haproxy/shctx-t.h @@ -0,0 +1,63 @@ +/* + * include/haproxy/shctx-t.h - shared context management functions for SSL + * + * Copyright (C) 2011-2012 EXCELIANCE + * + * Author: Emeric Brun - emeric@exceliance.fr + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef __HAPROXY_SHCTX_T_H +#define __HAPROXY_SHCTX_T_H + +#include <haproxy/api-t.h> +#include <haproxy/thread-t.h> + +#ifndef SHSESS_BLOCK_MIN_SIZE +#define SHSESS_BLOCK_MIN_SIZE 128 +#endif + +#ifndef SHSESS_MAX_DATA_LEN +#define SHSESS_MAX_DATA_LEN 4096 +#endif + +#ifndef SHCTX_APPNAME +#define SHCTX_APPNAME "haproxy" +#endif + +#define SHCTX_E_ALLOC_CACHE -1 +#define SHCTX_E_INIT_LOCK -2 + +#define SHCTX_F_REMOVING 0x1 /* Removing flag, does not accept new */ + +/* generic shctx struct */ +struct shared_block { + struct list list; + unsigned int len; /* data length for the row */ + unsigned int block_count; /* number of blocks */ + unsigned int refcount; + struct shared_block *last_reserved; + struct shared_block *last_append; + unsigned char data[VAR_ARRAY]; +}; + +struct shared_context { + __decl_thread(HA_RWLOCK_T lock); + struct list avail; /* list for active and free blocks */ + unsigned int nbav; /* number of available blocks */ + unsigned int max_obj_size; /* maximum object size (in bytes). */ + void (*free_block)(struct shared_block *first, void *data); + void (*reserve_finish)(struct shared_context *shctx); + void *cb_data; + short int block_size; + ALWAYS_ALIGN(64); /* The following member needs to be aligned to 64 in the + cache's case because the cache struct contains an explicitly + aligned member (struct cache_tree). */ + unsigned char data[VAR_ARRAY]; +}; + +#endif /* __HAPROXY_SHCTX_T_H */ diff --git a/include/haproxy/shctx.h b/include/haproxy/shctx.h new file mode 100644 index 0000000..a57cf15 --- /dev/null +++ b/include/haproxy/shctx.h @@ -0,0 +1,80 @@ +/* + * include/haproxy/shctx.h - shared context management functions for SSL + * + * Copyright (C) 2011-2012 EXCELIANCE + * + * Author: Emeric Brun - emeric@exceliance.fr + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */
+
+#ifndef __HAPROXY_SHCTX_H
+#define __HAPROXY_SHCTX_H
+
+#include <haproxy/api.h>
+#include <haproxy/list.h>
+#include <haproxy/shctx-t.h>
+#include <haproxy/thread.h>
+
+int shctx_init(struct shared_context **orig_shctx,
+               int maxblocks, int blocksize, unsigned int maxobjsz,
+               int extra);
+struct shared_block *shctx_row_reserve_hot(struct shared_context *shctx,
+                                           struct shared_block *last, int data_len);
+void shctx_row_detach(struct shared_context *shctx, struct shared_block *first);
+void shctx_row_reattach(struct shared_context *shctx, struct shared_block *first);
+int shctx_row_data_append(struct shared_context *shctx,
+                          struct shared_block *first,
+                          unsigned char *data, int len);
+int shctx_row_data_get(struct shared_context *shctx, struct shared_block *first,
+                       unsigned char *dst, int offset, int len);
+
+
+/* Lock functions */
+
+static inline void shctx_rdlock(struct shared_context *shctx)
+{
+	HA_RWLOCK_RDLOCK(SHCTX_LOCK, &shctx->lock);
+}
+static inline void shctx_rdunlock(struct shared_context *shctx)
+{
+	HA_RWLOCK_RDUNLOCK(SHCTX_LOCK, &shctx->lock);
+}
+static inline void shctx_wrlock(struct shared_context *shctx)
+{
+	HA_RWLOCK_WRLOCK(SHCTX_LOCK, &shctx->lock);
+}
+static inline void shctx_wrunlock(struct shared_context *shctx)
+{
+	HA_RWLOCK_WRUNLOCK(SHCTX_LOCK, &shctx->lock);
+}
+
+/* List helpers */
+
+/*
+ * Insert block <s> after <first>, which is not necessarily the head of a list,
+ * so between <first> and the next element after <first>.
+ */
+static inline void shctx_block_append_hot(struct shared_context *shctx,
+                                          struct shared_block *first,
+                                          struct shared_block *s)
+{
+	shctx->nbav--;
+	LIST_DELETE(&s->list);
+	LIST_APPEND(&first->list, &s->list);
+}
+
+static inline struct shared_block *shctx_block_detach(struct shared_context *shctx,
+                                                      struct shared_block *s)
+{
+	shctx->nbav--;
+	LIST_DELETE(&s->list);
+	LIST_INIT(&s->list);
+	return s;
+}
+
+#endif /* __HAPROXY_SHCTX_H */
+
diff --git a/include/haproxy/show_flags-t.h b/include/haproxy/show_flags-t.h
new file mode 100644
index 0000000..824d771
--- /dev/null
+++ b/include/haproxy/show_flags-t.h
@@ -0,0 +1,99 @@
+/*
+ * include/haproxy/show_flags-t.h
+ * These are helper macros used to decode flags for debugging
+ *
+ * Copyright (C) 2022 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_SHOW_FLAGS_H
+#define _HAPROXY_SHOW_FLAGS_H
+
+/* Only define the macro below if the caller requests it using HA_EXPOSE_FLAGS.
+ * It will be used by many low-level includes and we don't want to
+ * include the huge stdio here by default. The macro is used to make a string
+ * of a set of flags (and handles one flag at a time). It will append into
+ * <_buf>:<_len> the state of flag <_val> in <_flg>, appending string <_del> as
+ * delimiters till the last flag is dumped, then updating <_buf> and <_len>
+ * accordingly.
+ * <_nam> is used as the name for value <_val>. <_flg> loses all
+ * dumped flags. If <_flg> is zero and <_val> is 0, a "0" is reported, this can
+ * be used as a prologue to the dump. If <_val> contains more than one bit set,
+ * <_flg>'s hexadecimal output is reported instead of a name.
+ *
+ * It is possible to use it to enumerate all flags from right to left so that
+ * they are easier to check in the code. It will start by executing the optional
+ * code block in the extra arguments (if any) before proceeding with the dump
+ * using the arguments. It is suggested to rename it locally to a single-char
+ * macro for readability, e.g:
+ *
+ *   #define _(n, ...) __APPEND_FLAG(buf, len, del, flg, n, #n, __VA_ARGS__)
+ *   _(0);
+ *   _(X_FLAG1, _(X_FLAG2, _(X_FLAG3)));
+ *   _(~0);
+ *   #undef _
+ *
+ * __APPEND_ENUM() works a bit differently in that it takes an additional mask
+ * to isolate bits to compare to the enum's value, and will remove the mask's
+ * bits at once in case of match.
+ */
+#ifdef HA_EXPOSE_FLAGS
+
+#define __APPEND_FLAG(_buf, _len, _del, _flg, _val, _nam, ...)		\
+	do {								\
+		size_t _ret = 0;					\
+		unsigned int _flg0 = (_flg);				\
+		do { __VA_ARGS__; } while (0);				\
+		(_flg) &= ~(unsigned int)(_val);			\
+		if (!((unsigned int)_val) && !(_flg))			\
+			_ret = snprintf(_buf, _len, "0%s",		\
+					(_flg) ? (_del) : "");		\
+		else if ((_flg0) & (_val)) {				\
+			if ((_val) & ((_val) - 1))			\
+				_ret = snprintf(_buf, _len, "%#x%s",	\
+						(_flg0), (_flg) ? (_del) : ""); \
+			else						\
+				_ret = snprintf(_buf, _len, _nam "%s",	\
+						(_flg) ? (_del) : "");	\
+		}							\
+		if (_ret < _len) {					\
+			_len -= _ret;					\
+			_buf += _ret;					\
+		}							\
+	} while (0)
+
+#define __APPEND_ENUM(_buf, _len, _del, _flg, _msk, _val, _nam, ...)	\
+	do {								\
+		size_t _ret = 0;					\
+		do { __VA_ARGS__; } while (0);				\
+		if (((_flg) & (_msk)) == (_val)) {			\
+			(_flg) &= ~(_msk);				\
+			_ret = snprintf(_buf, _len, _nam "%s",		\
+					(_flg) ? (_del) : "");		\
+		}							\
+		if (_ret < _len) {					\
+			_len -= _ret;					\
+			_buf += _ret;					\
+		}							\
+	} while (0)
+
+#else /* HA_EXPOSE_FLAGS not defined => no stdio, do nothing */
+
+#define __APPEND_FLAG(_buf, _len, _del, _flg, _val, _nam, ...) do { } while (0)
+#define __APPEND_ENUM(_buf, _len, _del, _flg, _msk, _val, _nam, ...) do { } while (0)
+
+#endif /* HA_EXPOSE_FLAGS */
+
+#endif /* _HAPROXY_SHOW_FLAGS_H */
diff --git a/include/haproxy/signal-t.h b/include/haproxy/signal-t.h
new file mode 100644
index 0000000..85d4b33
--- /dev/null
+++ b/include/haproxy/signal-t.h
@@ -0,0 +1,66 @@
+/*
+ * include/haproxy/signal-t.h
+ * Asynchronous signal delivery functions descriptors.
+ *
+ * Copyright 2000-2010 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef _HAPROXY_SIGNAL_T_H
+#define _HAPROXY_SIGNAL_T_H
+
+#include <signal.h>
+#include <haproxy/api-t.h>
+
+/* flags for -> flags */
+#define SIG_F_ONE_SHOOT		0x0001	/* unregister handler before calling it */
+#define SIG_F_TYPE_FCT		0x0002	/* handler is a function + arg */
+#define SIG_F_TYPE_TASK		0x0004	/* handler is a task + reason */
+
+/* Define WDTSIG if available */
+#if defined(USE_RT) && (_POSIX_TIMERS > 0) && defined(_POSIX_THREAD_CPUTIME)
+
+
+/* We'll deliver SIGALRM when we've run out of CPU as it's not intercepted by
+ * gdb by default.
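+ * A per-thread CPU-time timer (e.g. one created with timer_create() on
+ * CLOCK_THREAD_CPUTIME_ID) is a typical source of this signal; this is an
+ * illustration, not a requirement of this header.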
+ */ +#define WDTSIG SIGALRM + +#endif + +#ifdef USE_THREAD_DUMP +/* The signal to trigger a debug dump on a thread is SIGURG. It has the benefit + * of not stopping gdb by default, so that issuing "show threads" in a process + * being debugged has no adverse effect. + */ +#define DEBUGSIG SIGURG + +#endif + +/* those are highly dynamic and stored in pools */ +struct sig_handler { + struct list list; + void *handler; /* function to call or task to wake up */ + int arg; /* arg to pass to function, or signals*/ + int flags; /* SIG_F_* */ +}; + +/* one per signal */ +struct signal_descriptor { + int count; /* number of times raised */ + struct list handlers; /* sig_handler */ +}; + +#endif /* _HAPROXY_SIGNAL_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/signal.h b/include/haproxy/signal.h new file mode 100644 index 0000000..25a4ef1 --- /dev/null +++ b/include/haproxy/signal.h @@ -0,0 +1,52 @@ +/* + * include/haproxy/signal.h + * Asynchronous signal delivery functions. + * + * Copyright 2000-2010 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#ifndef _HAPROXY_SIGNAL_H +#define _HAPROXY_SIGNAL_H + +#include <signal.h> + +#include <haproxy/api.h> +#include <haproxy/signal-t.h> +#include <haproxy/task-t.h> +#include <haproxy/thread.h> + +extern int signal_queue_len; +extern struct signal_descriptor signal_state[]; + +__decl_thread(extern HA_SPINLOCK_T signals_lock); + +void signal_handler(int sig); +void __signal_process_queue(void); +void deinit_signals(void); +struct sig_handler *signal_register_fct(int sig, void (*fct)(struct sig_handler *), int arg); +struct sig_handler *signal_register_task(int sig, struct task *task, int reason); +void signal_unregister_handler(struct sig_handler *handler); +void signal_unregister_target(int sig, void *target); +void signal_unregister(int sig); +void haproxy_unblock_signals(void); + +static inline void signal_process_queue() +{ + if (unlikely(signal_queue_len > 0)) + __signal_process_queue(); +} + +#endif /* _HAPROXY_SIGNAL_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/sink-t.h b/include/haproxy/sink-t.h new file mode 100644 index 0000000..79a0dda --- /dev/null +++ b/include/haproxy/sink-t.h @@ -0,0 +1,76 @@ +/* + * include/haproxy/sink-t.h + * This file provides definitions for event sinks + * + * Copyright (C) 2000-2019 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_SINK_T_H
+#define _HAPROXY_SINK_T_H
+
+#include <import/ist.h>
+#include <haproxy/api-t.h>
+#include <haproxy/log-t.h>
+
+/* A sink may be of two distinct types :
+ *   - file descriptor (such as stdout)
+ *   - ring buffer, readable from the CLI
+ */
+enum sink_type {
+	SINK_TYPE_NEW,      // not yet initialized
+	SINK_TYPE_FD,       // events sent to a file descriptor
+	SINK_TYPE_BUFFER,   // events sent to a ring buffer
+};
+
+struct sink_forward_target {
+	struct server *srv;    // used server
+	struct appctx *appctx; // appctx of current session
+	size_t ofs;            // ring buffer reader offset
+	struct sink *sink;     // the associated sink
+	struct sink_forward_target *next;
+	__decl_thread(HA_SPINLOCK_T lock); // lock to protect current struct
+};
+
+/* describes the configuration and current state of an event sink */
+struct sink {
+	struct list sink_list;     // position in the sink list
+	char *name;                // sink name
+	char *desc;                // sink description
+	char *store;               // backing-store file when buffer
+	enum log_fmt fmt;          // format expected by the sink
+	enum sink_type type;       // type of storage
+	uint32_t maxlen;           // max message length (truncated above)
+	struct proxy *forward_px;  // internal proxy used to forward (only set when exclusive to sink)
+	struct sink_forward_target *sft; // sink forward targets
+	struct task *forward_task; // task to handle forward targets conns
+	struct sig_handler *forward_sighandler; /* signal handler */
+	struct {
+		struct ring *ring;    // used by ring buffer and STRM sender
+		unsigned int dropped; // dropped events since last one.
+		int fd;               // fd num for FD type sink
+		__decl_thread(HA_RWLOCK_T lock); // shared/excl for dropped
+	} ctx;
+};
+
+#endif /* _HAPROXY_SINK_T_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/sink.h b/include/haproxy/sink.h
new file mode 100644
index 0000000..3b428a1
--- /dev/null
+++ b/include/haproxy/sink.h
@@ -0,0 +1,97 @@
+/*
+ * include/haproxy/sink.h
+ * This file provides declarations for event sinks management
+ *
+ * Copyright (C) 2000-2019 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_SINK_H
+#define _HAPROXY_SINK_H
+
+#include <sys/types.h>
+#include <haproxy/sink-t.h>
+#include <haproxy/thread.h>
+
+extern struct list sink_list;
+
+extern struct proxy *sink_proxies_list;
+
+struct sink *sink_find(const char *name);
+struct sink *sink_new_fd(const char *name, const char *desc, enum log_fmt, int fd);
+ssize_t __sink_write(struct sink *sink, struct log_header hdr, size_t maxlen,
+                     const struct ist msg[], size_t nmsg);
+int sink_announce_dropped(struct sink *sink, struct log_header hdr);
+
+
+/* tries to send <nmsg> message parts from message array <msg> to sink <sink>.
+ * Formatting according to the sink's preference is done here, unless sink->fmt
+ * is unspecified, in which case the caller's formatting will be used instead.
+ *
+ * It will stop writing at <maxlen> instead of sink->maxlen if <maxlen> is
+ * positive and lower than sink->maxlen.
+ *
+ * Lost messages are accounted for in the sink's counter. If there
+ * were lost messages, an attempt is first made to indicate it.
+ * The function returns the number of bytes effectively sent or announced,
+ * or <= 0 in other cases.
+ */
+static inline ssize_t sink_write(struct sink *sink, struct log_header hdr,
+                                 size_t maxlen, const struct ist msg[], size_t nmsg)
+{
+	ssize_t sent;
+
+	if (unlikely(sink->ctx.dropped > 0)) {
+		/* We need to take an exclusive lock so that other producers
+		 * don't do the same thing at the same time and above all we
+		 * want to be sure others have finished sending their messages
+		 * so that the dropped event arrives exactly at the right
+		 * position.
+		 */
+		HA_RWLOCK_WRLOCK(RING_LOCK, &sink->ctx.lock);
+		sent = sink_announce_dropped(sink, hdr);
+		HA_RWLOCK_WRUNLOCK(RING_LOCK, &sink->ctx.lock);
+
+		if (!sent) {
+			/* we failed, we don't try to send our log as if it
+			 * would pass by chance, we'd get disordered events.
+			 */
+			goto fail;
+		}
+	}
+
+	HA_RWLOCK_RDLOCK(RING_LOCK, &sink->ctx.lock);
+	sent = __sink_write(sink, hdr, maxlen, msg, nmsg);
+	HA_RWLOCK_RDUNLOCK(RING_LOCK, &sink->ctx.lock);
+
+ fail:
+	if (unlikely(sent <= 0))
+		HA_ATOMIC_INC(&sink->ctx.dropped);
+
+	return sent;
+}
+
+struct sink *sink_new_from_srv(struct server *srv, const char *from);
+int sink_resolve_logger_buffer(struct logger *logger, char **msg);
+
+#endif /* _HAPROXY_SINK_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/sock-t.h b/include/haproxy/sock-t.h
new file mode 100644
index 0000000..b843d44
--- /dev/null
+++ b/include/haproxy/sock-t.h
@@ -0,0 +1,37 @@
+/*
+ * include/haproxy/sock-t.h
+ * This file contains type definitions for native (BSD-compatible) sockets.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SOCK_T_H +#define _HAPROXY_SOCK_T_H + +#include <sys/socket.h> +#include <sys/types.h> + +#include <haproxy/api-t.h> + +#endif /* _HAPROXY_SOCK_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/sock.h b/include/haproxy/sock.h new file mode 100644 index 0000000..60e81ec --- /dev/null +++ b/include/haproxy/sock.h @@ -0,0 +1,62 @@ +/* + * include/haproxy/sock.h + * This file contains declarations for native (BSD-compatible) sockets. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SOCK_H +#define _HAPROXY_SOCK_H + +#include <sys/socket.h> +#include <sys/types.h> + +#include <haproxy/api.h> +#include <haproxy/connection-t.h> +#include <haproxy/listener-t.h> +#include <haproxy/sock-t.h> + +int sock_create_server_socket(struct connection *conn); +void sock_enable(struct receiver *rx); +void sock_disable(struct receiver *rx); +void sock_unbind(struct receiver *rx); +int sock_get_src(int fd, struct sockaddr *sa, socklen_t salen, int dir); +int sock_get_dst(int fd, struct sockaddr *sa, socklen_t salen, int dir); +int sock_get_old_sockets(const char *unixsocket); +int sock_find_compatible_fd(const struct receiver *rx); +void sock_drop_unused_old_sockets(); +int sock_accepting_conn(const struct receiver *rx); +struct connection *sock_accept_conn(struct listener *l, int *status); +void sock_accept_iocb(int fd); +void sock_conn_ctrl_init(struct connection *conn); +void sock_conn_ctrl_close(struct connection *conn); +void sock_conn_iocb(int fd); +int sock_conn_check(struct connection *conn); +int sock_drain(struct connection *conn); +int sock_check_events(struct connection *conn, int event_type); +void sock_ignore_events(struct connection *conn, int event_type); +int _sock_supports_reuseport(const struct proto_fam *fam, int type, int protocol); + + +#endif /* _HAPROXY_SOCK_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/sock_inet.h b/include/haproxy/sock_inet.h new file mode 100644 index 0000000..6f07e63 --- /dev/null +++ b/include/haproxy/sock_inet.h @@ -0,0 +1,49 @@ +/* + * include/haproxy/sock_inet.h + * This file contains declarations for AF_INET & AF_INET6 sockets. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SOCK_INET_H +#define _HAPROXY_SOCK_INET_H + +#include <sys/socket.h> +#include <sys/types.h> + +#include <haproxy/api.h> + +extern int sock_inet6_v6only_default; +extern int sock_inet_tcp_maxseg_default; +extern int sock_inet6_tcp_maxseg_default; + +extern struct proto_fam proto_fam_inet4; +extern struct proto_fam proto_fam_inet6; + +/* external types */ +struct receiver; + +int sock_inet4_addrcmp(const struct sockaddr_storage *a, const struct sockaddr_storage *b); +int sock_inet6_addrcmp(const struct sockaddr_storage *a, const struct sockaddr_storage *b); +void sock_inet_set_port(struct sockaddr_storage *addr, int port); +int sock_inet_get_dst(int fd, struct sockaddr *sa, socklen_t salen, int dir); +int sock_inet_is_foreign(int fd, sa_family_t family); +int sock_inet4_make_foreign(int fd); +int sock_inet6_make_foreign(int fd); +int sock_inet_bind_receiver(struct receiver *rx, char **errmsg); + +#endif /* _HAPROXY_SOCK_INET_H */ diff --git a/include/haproxy/sock_unix.h b/include/haproxy/sock_unix.h new file mode 100644 index 0000000..9934341 --- /dev/null +++ b/include/haproxy/sock_unix.h @@ -0,0 +1,36 @@ +/* + * include/haproxy/sock_unix.h + * This file contains declarations for AF_UNIX sockets. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SOCK_UNIX_H +#define _HAPROXY_SOCK_UNIX_H + +#include <sys/socket.h> +#include <sys/types.h> + +#include <haproxy/api.h> +#include <haproxy/receiver-t.h> + +extern struct proto_fam proto_fam_unix; + +int sock_unix_addrcmp(const struct sockaddr_storage *a, const struct sockaddr_storage *b); +int sock_unix_bind_receiver(struct receiver *rx, char **errmsg); + +#endif /* _HAPROXY_SOCK_UNIX_H */ diff --git a/include/haproxy/spoe-t.h b/include/haproxy/spoe-t.h new file mode 100644 index 0000000..2732443 --- /dev/null +++ b/include/haproxy/spoe-t.h @@ -0,0 +1,413 @@ +/* + * include/haproxy/spoe-t.h + * Macros, variables and structures for the SPOE filter. + * + * Copyright (C) 2017 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SPOE_T_H +#define _HAPROXY_SPOE_T_H + +#include <sys/time.h> + +#include <haproxy/buf-t.h> +#include <haproxy/dynbuf-t.h> +#include <haproxy/filters-t.h> +#include <haproxy/freq_ctr-t.h> +#include <haproxy/proxy-t.h> +#include <haproxy/sample-t.h> +#include <haproxy/stream-t.h> +#include <haproxy/task-t.h> +#include <haproxy/thread-t.h> + +/* Type of list of messages */ +#define SPOE_MSGS_BY_EVENT 0x01 +#define SPOE_MSGS_BY_GROUP 0x02 + +/* Flags set on the SPOE agent */ +#define SPOE_FL_CONT_ON_ERR 0x00000001 /* Do not stop event processing when an error occurs */ +#define SPOE_FL_PIPELINING 0x00000002 /* Set when SPOE agent supports pipelining (set by default) */ +#define SPOE_FL_ASYNC 0x00000004 /* Set when SPOE agent supports async (set by default) */ +#define SPOE_FL_SND_FRAGMENTATION 0x00000008 /* Set when SPOE agent supports sending fragmented payload */ +#define SPOE_FL_RCV_FRAGMENTATION 0x00000010 /* Set when SPOE agent supports receiving fragmented payload */ +#define SPOE_FL_FORCE_SET_VAR 0x00000020 /* Set when SPOE agent will set all variables from agent (and not only known variables) */ + +/* Flags set on the SPOE context */ +#define SPOE_CTX_FL_CLI_CONNECTED 0x00000001 /* Set once the on-client-session event has been processed */ +#define SPOE_CTX_FL_SRV_CONNECTED 0x00000002 /* Set once the on-server-session event has been processed */ +#define SPOE_CTX_FL_REQ_PROCESS 0x00000004 /* Set when SPOE is processing the request */ +#define SPOE_CTX_FL_RSP_PROCESS 0x00000008 /* Set when SPOE is processing the response */ +#define SPOE_CTX_FL_FRAGMENTED 0x00000010 /* Set when a fragmented frame is being processed */ + +#define SPOE_CTX_FL_PROCESS (SPOE_CTX_FL_REQ_PROCESS|SPOE_CTX_FL_RSP_PROCESS) + +/* Flags set on the SPOE applet */ +#define SPOE_APPCTX_FL_PIPELINING 0x00000001 /* Set if pipelining is supported */ +#define SPOE_APPCTX_FL_ASYNC 0x00000002 /* Set if asynchronous frames are supported */ +#define SPOE_APPCTX_FL_FRAGMENTATION 0x00000004 /* Set if fragmentation is supported */ + +#define SPOE_APPCTX_ERR_NONE 0x00000000 /* no error yet, leave it to zero */ +#define SPOE_APPCTX_ERR_TOUT 0x00000001 /* SPOE applet timeout */ + +/* Flags set on the SPOE frame */ +#define SPOE_FRM_FL_FIN 0x00000001 +#define SPOE_FRM_FL_ABRT 0x00000002 + +/* Masks to get data type or flags value */ +#define SPOE_DATA_T_MASK 0x0F +#define SPOE_DATA_FL_MASK 0xF0 + +/* Flags to set Boolean values */ +#define SPOE_DATA_FL_FALSE 0x00 +#define SPOE_DATA_FL_TRUE 0x10 + +/* All possible states for a SPOE context */ +enum spoe_ctx_state { + SPOE_CTX_ST_NONE = 0, + SPOE_CTX_ST_READY, + SPOE_CTX_ST_ENCODING_MSGS, + SPOE_CTX_ST_SENDING_MSGS, + SPOE_CTX_ST_WAITING_ACK, + SPOE_CTX_ST_DONE, + SPOE_CTX_ST_ERROR, +}; + +/* All possible states for a SPOE applet */ +enum spoe_appctx_state { + SPOE_APPCTX_ST_CONNECT = 0, + SPOE_APPCTX_ST_CONNECTING, + SPOE_APPCTX_ST_IDLE, + SPOE_APPCTX_ST_PROCESSING, + SPOE_APPCTX_ST_SENDING_FRAG_NOTIFY, + SPOE_APPCTX_ST_WAITING_SYNC_ACK, + SPOE_APPCTX_ST_DISCONNECT, + 
SPOE_APPCTX_ST_DISCONNECTING, + SPOE_APPCTX_ST_EXIT, + SPOE_APPCTX_ST_END, +}; + +/* All supported SPOE actions */ +enum spoe_action_type { + SPOE_ACT_T_SET_VAR = 1, + SPOE_ACT_T_UNSET_VAR, + SPOE_ACT_TYPES, +}; + +/* All supported SPOE events */ +enum spoe_event { + SPOE_EV_NONE = 0, + + /* Request events */ + SPOE_EV_ON_CLIENT_SESS = 1, + SPOE_EV_ON_TCP_REQ_FE, + SPOE_EV_ON_TCP_REQ_BE, + SPOE_EV_ON_HTTP_REQ_FE, + SPOE_EV_ON_HTTP_REQ_BE, + + /* Response events */ + SPOE_EV_ON_SERVER_SESS, + SPOE_EV_ON_TCP_RSP, + SPOE_EV_ON_HTTP_RSP, + + SPOE_EV_EVENTS +}; + +/* Errors triggered by streams */ +enum spoe_context_error { + SPOE_CTX_ERR_NONE = 0, + SPOE_CTX_ERR_TOUT, + SPOE_CTX_ERR_RES, + SPOE_CTX_ERR_TOO_BIG, + SPOE_CTX_ERR_FRAG_FRAME_ABRT, + SPOE_CTX_ERR_INTERRUPT, + SPOE_CTX_ERR_UNKNOWN = 255, + SPOE_CTX_ERRS, +}; + +/* Errors triggered by SPOE applet */ +enum spoe_frame_error { + SPOE_FRM_ERR_NONE = 0, + SPOE_FRM_ERR_IO, + SPOE_FRM_ERR_TOUT, + SPOE_FRM_ERR_TOO_BIG, + SPOE_FRM_ERR_INVALID, + SPOE_FRM_ERR_NO_VSN, + SPOE_FRM_ERR_NO_FRAME_SIZE, + SPOE_FRM_ERR_NO_CAP, + SPOE_FRM_ERR_BAD_VSN, + SPOE_FRM_ERR_BAD_FRAME_SIZE, + SPOE_FRM_ERR_FRAG_NOT_SUPPORTED, + SPOE_FRM_ERR_INTERLACED_FRAMES, + SPOE_FRM_ERR_FRAMEID_NOTFOUND, + SPOE_FRM_ERR_RES, + SPOE_FRM_ERR_UNKNOWN = 99, + SPOE_FRM_ERRS, +}; + +/* Scopes used for variables set by agents. It is a way to remain agnostic to + * the vars scope. */ +enum spoe_vars_scope { + SPOE_SCOPE_PROC = 0, /* <=> SCOPE_PROC */ + SPOE_SCOPE_SESS, /* <=> SCOPE_SESS */ + SPOE_SCOPE_TXN, /* <=> SCOPE_TXN */ + SPOE_SCOPE_REQ, /* <=> SCOPE_REQ */ + SPOE_SCOPE_RES, /* <=> SCOPE_RES */ +}; + +/* Frame Types sent by HAProxy and by agents */ +enum spoe_frame_type { + SPOE_FRM_T_UNSET = 0, + + /* Frames sent by HAProxy */ + SPOE_FRM_T_HAPROXY_HELLO = 1, + SPOE_FRM_T_HAPROXY_DISCON, + SPOE_FRM_T_HAPROXY_NOTIFY, + + /* Frames sent by the agents */ + SPOE_FRM_T_AGENT_HELLO = 101, + SPOE_FRM_T_AGENT_DISCON, + SPOE_FRM_T_AGENT_ACK +}; + +/* All supported data types */ +enum spoe_data_type { + SPOE_DATA_T_NULL = 0, + SPOE_DATA_T_BOOL, + SPOE_DATA_T_INT32, + SPOE_DATA_T_UINT32, + SPOE_DATA_T_INT64, + SPOE_DATA_T_UINT64, + SPOE_DATA_T_IPV4, + SPOE_DATA_T_IPV6, + SPOE_DATA_T_STR, + SPOE_DATA_T_BIN, + SPOE_DATA_TYPES +}; + + +/* Describes an argument that will be linked to a message. It is a sample fetch, + * with an optional name. */ +struct spoe_arg { + char *name; /* Name of the argument, may be NULL */ + unsigned int name_len; /* The name length, 0 if NULL */ + struct sample_expr *expr; /* Sample expression */ + struct list list; /* Used to chain SPOE args */ +}; + +/* Used only during config parsing because, when a SPOE agent section is + * parsed, messages/groups may still be undefined. */ +struct spoe_placeholder { + char *id; /* SPOE placeholder id */ + struct list list; /* Used to chain SPOE placeholders */ +}; + +/* Used during config parsing, when a SPOE agent section is parsed, to + * register variable names. */ +struct spoe_var_placeholder { + char *name; /* The variable name */ + struct list list; /* Used to chain SPOE var placeholders */ +}; + +/* Describes a message that will be sent in a NOTIFY frame. A message has a name, + * an argument list (see above) and it is linked to a specific event. 
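+ * Example (illustrative, borrowed from the SPOE documentation): a + * "spoe-message check-client-ip" section declaring "args ip=src" and "event + * on-client-session" would be parsed into one such structure, whose <args> + * list holds a single spoe_arg and whose <event> is SPOE_EV_ON_CLIENT_SESS. 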
*/ +struct spoe_message { + char *id; /* SPOE message id */ + unsigned int id_len; /* The message id length */ + struct spoe_agent *agent; /* SPOE agent owning this SPOE message */ + struct spoe_group *group; /* SPOE group owning this SPOE message (can be NULL) */ + struct { + char *file; /* file where the SPOE message appears */ + int line; /* line where the SPOE message appears */ + } conf; /* config information */ + unsigned int nargs; /* # of arguments */ + struct list args; /* Arguments added when the SPOE message is sent */ + struct list list; /* Used to chain SPOE messages */ + struct list by_evt; /* By event list */ + struct list by_grp; /* By group list */ + + struct list acls; /* ACL declared on this message */ + struct acl_cond *cond; /* acl condition to meet */ + enum spoe_event event; /* SPOE_EV_* */ +}; + +/* Describes a group of messages that will be sent in a NOTIFY frame. A group has + * a name and a list of messages. It can be used by HAProxy, outside of event + * processing, mainly in (tcp|http) rules. */ +struct spoe_group { + char *id; /* SPOE group id */ + struct spoe_agent *agent; /* SPOE agent owning this SPOE group */ + struct { + char *file; /* file where the SPOE group appears */ + int line; /* line where the SPOE group appears */ + } conf; /* config information */ + + struct list phs; /* List of placeholders used during conf parsing */ + struct list messages; /* List of SPOE messages that will be sent by this + * group */ + + struct list list; /* Used to chain SPOE groups */ +}; + +/* Describes a SPOE agent. */ +struct spoe_agent { + char *id; /* SPOE agent id (name) */ + struct { + char *file; /* file where the SPOE agent appears */ + int line; /* line where the SPOE agent appears */ + } conf; /* config information */ + union { + struct proxy *be; /* Backend used by this agent */ + char *name; /* Backend name used during conf parsing */ + } b; + struct { + unsigned int hello; /* Max time to receive AGENT-HELLO frame (in SPOE applet) */ + unsigned int idle; /* Max Idle timeout (in SPOE applet) */ + unsigned int processing; /* Max time to process an event (in the main stream) */ + } timeout; + + /* Config info */ + struct spoe_config *spoe_conf; /* SPOE filter config */ + char *var_pfx; /* Prefix used for vars set by the agent */ + char *var_on_error; /* Variable to set when an error occurs, in the TXN scope */ + char *var_t_process; /* Variable to set to report the processing time of the last event/group, in the TXN scope */ + char *var_t_total; /* Variable to set to report the cumulative processing time, in the TXN scope */ + unsigned int flags; /* SPOE_FL_* */ + unsigned int cps_max; /* Maximum # of connections per second */ + unsigned int eps_max; /* Maximum # of errors per second */ + unsigned int max_frame_size; /* Maximum frame size for this agent, before any negotiation */ + unsigned int max_fpa; /* Maximum # of frames handled per applet at once */ + + struct list events[SPOE_EV_EVENTS]; /* List of SPOE messages that will be sent + * for each supported event */ + + struct list groups; /* List of available SPOE groups */ + + struct list messages; /* list of all messages attached to this SPOE agent */ + + /* running info */ + struct { + char *engine_id; /* engine-id string */ + unsigned int frame_size; /* current maximum frame size, only used to encode messages */ + unsigned int processing; + struct freq_ctr processing_per_sec; + + struct freq_ctr conn_per_sec; /* connections per second */ + struct freq_ctr err_per_sec; /* connection errors per second */ + 
+ unsigned int idles; /* # of idle applets */ + struct eb_root idle_applets; /* idle SPOE applets available to process data */ + struct list applets; /* all SPOE applets for this agent */ + struct list sending_queue; /* Queue of streams waiting to send data */ + struct list waiting_queue; /* Queue of streams waiting for an ack, in async mode */ + __decl_thread(HA_SPINLOCK_T lock); + } *rt; + + struct { + unsigned int applets; /* # of SPOE applets */ + unsigned int idles; /* # of idle applets */ + unsigned int nb_sending; /* # of streams waiting to send data */ + unsigned int nb_waiting; /* # of streams waiting for an ack */ + unsigned long long nb_processed; /* # of frames processed by the SPOE */ + unsigned long long nb_errors; /* # of errors during the processing */ + } counters; +}; + +/* SPOE filter configuration */ +struct spoe_config { + char *id; /* The SPOE engine name. If undefined in HAProxy config, + * it will be set to the SPOE agent name */ + struct proxy *proxy; /* Proxy owning the filter */ + struct spoe_agent *agent; /* Agent used by this filter */ + struct proxy agent_fe; /* Agent frontend */ +}; + +/* SPOE context attached to a stream. It is the main structure that handles the + * processing offload */ +struct spoe_context { + struct filter *filter; /* The SPOE filter */ + struct stream *strm; /* The stream that should be offloaded */ + + struct list *events; /* List of messages that will be sent during the stream processing */ + struct list *groups; /* List of available SPOE groups */ + + struct buffer buffer; /* Buffer used to store encoded messages */ + struct buffer_wait buffer_wait; /* position in the list of resources waiting for a buffer */ + struct list list; + + enum spoe_ctx_state state; /* SPOE_CTX_ST_* */ + unsigned int flags; /* SPOE_CTX_FL_* */ + unsigned int status_code; /* SPOE_CTX_ERR_* */ + + unsigned int stream_id; /* stream_id and frame_id are used */ + unsigned int frame_id; /* to map NOTIFY and ACK frames */ + unsigned int process_exp; /* expiration date to process an event */ + + struct spoe_appctx *spoe_appctx; /* SPOE appctx sending the current frame */ + struct { + struct spoe_message *curmsg; /* SPOE message from which to resume encoding */ + struct spoe_arg *curarg; /* SPOE arg in <curmsg> from which to resume encoding */ + unsigned int curoff; /* offset in <curarg> from which to resume encoding */ + unsigned int curlen; /* remaining length of <curarg> to encode, for SMP_F_MAY_CHANGE data */ + unsigned int flags; /* SPOE_FRM_FL_* */ + } frag_ctx; /* Info about fragmented frames, valid only if SPOE_CTX_FL_FRAGMENTED is set */ + + struct { + ullong start_ts; /* start date of the current event/group */ + ullong request_ts; /* date the frame processing starts (reset for each frag) */ + ullong queue_ts; /* date the frame is queued (reset for each frag) */ + ullong wait_ts; /* date the stream starts waiting for a response */ + ullong response_ts; /* date the response processing starts */ + long t_request; /* delay to encode and push the frame in queue (cumulative for frags) */ + long t_queue; /* delay before the frame gets out the sending queue (cumulative for frags) */ + long t_waiting; /* delay before the response is received */ + long t_response; /* delay to process the response (from the stream pov) */ + long t_process; /* processing time of the last event/group */ + unsigned long t_total; /* cumulative processing time */ + } stats; /* Stats for this stream */ +}; + +/* SPOE context inside an appctx */ +struct spoe_appctx { + struct appctx *owner; /* 
the owner */ + struct task *task; /* task to handle applet timeouts */ + struct spoe_agent *agent; /* agent to which the applet is attached */ + + unsigned int version; /* the negotiated version */ + unsigned int max_frame_size; /* the negotiated max-frame-size value */ + unsigned int flags; /* SPOE_APPCTX_FL_* */ + + unsigned int status_code; /* SPOE_FRM_ERR_* */ +#if defined(DEBUG_SPOE) || defined(DEBUG_FULL) + char *reason; /* Error message, used for debugging only */ + int rlen; /* reason length */ +#endif + + struct buffer buffer; /* Buffer used to store encoded messages */ + struct buffer_wait buffer_wait; /* position in the list of resources waiting for a buffer */ + struct list waiting_queue; /* list of streams waiting for an ACK frame, in sync and pipelining mode */ + struct list list; /* next spoe appctx for the same agent */ + struct eb32_node node; /* node used for applets tree */ + unsigned int cur_fpa; + + struct { + struct spoe_context *ctx; /* SPOE context owning the fragmented frame */ + unsigned int cursid; /* stream-id of the fragmented frame. used if the processing is aborted */ + unsigned int curfid; /* frame-id of the fragmented frame. used if the processing is aborted */ + } frag_ctx; /* Info about fragmented frames, unused for unfragmented frames */ +}; + +#endif /* _HAPROXY_SPOE_T_H */ diff --git a/include/haproxy/spoe.h b/include/haproxy/spoe.h new file mode 100644 index 0000000..7cd0987 --- /dev/null +++ b/include/haproxy/spoe.h @@ -0,0 +1,351 @@ +/* + * include/haproxy/spoe.h + * Encoding/Decoding functions for the SPOE filters (and other helpers). + * + * Copyright (C) 2017 HAProxy Technologies, Christopher Faulet <cfaulet@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SPOE_H +#define _HAPROXY_SPOE_H + +#include <haproxy/api.h> +#include <haproxy/intops.h> +#include <haproxy/sample-t.h> +#include <haproxy/spoe-t.h> + + +/* Encode a buffer. Its length <len> is encoded as a varint, followed by a copy + * of <str>. There must be enough space in <*buf> to encode the buffer, else an + * error is triggered. + * On success, it returns <len> and <*buf> is moved after the encoded value. If + * an error occurred, it returns -1. */ +static inline int +spoe_encode_buffer(const char *str, size_t len, char **buf, char *end) +{ + char *p = *buf; + int ret; + + if (p >= end) + return -1; + + if (!len) { + *p++ = 0; + *buf = p; + return 0; + } + + ret = encode_varint(len, &p, end); + if (ret == -1 || p + len > end) + return -1; + + memcpy(p, str, len); + *buf = p + len; + return len; +} + +/* Encode a buffer, possibly partially. It does the same thing as + * 'spoe_encode_buffer', but it does not fail when there is not enough space. + * On success, it returns the number of copied bytes and <*buf> is moved after + * the encoded value. If an error occurred, it returns -1. 
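+ * + * A minimal usage sketch (hypothetical buffer and values): + * char out[64], *p = out; + * int n = spoe_encode_frag_buffer("hello", 5, &p, out + sizeof(out)); + * Here n == 5 and p points just past the copied bytes; with a buffer too + * small for the whole string, n would be the number of bytes that actually + * fit, and the caller would send the remainder in a later fragment. 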
*/ +static inline int +spoe_encode_frag_buffer(const char *str, size_t len, char **buf, char *end) +{ + char *p = *buf; + int ret; + + if (p >= end) + return -1; + + if (!len) { + *p++ = 0; + *buf = p; + return 0; + } + + ret = encode_varint(len, &p, end); + if (ret == -1 || p >= end) + return -1; + + ret = (p+len < end) ? len : (end - p); + memcpy(p, str, ret); + *buf = p + ret; + return ret; +} + +/* Decode a buffer. The buffer length is decoded and saved in <*len>. <*str> + * points to the first byte of the buffer. + * On success, it returns the buffer length and <*buf> is moved after the + * encoded buffer. Otherwise, it returns -1. */ +static inline int +spoe_decode_buffer(char **buf, char *end, char **str, uint64_t *len) +{ + char *p = *buf; + uint64_t sz; + int ret; + + *str = NULL; + *len = 0; + + ret = decode_varint(&p, end, &sz); + if (ret == -1 || p + sz > end) + return -1; + + *str = p; + *len = sz; + *buf = p + sz; + return sz; +} + +/* Encode a typed data using value in <smp>. On success, it returns the number + * of copied bytes and <*buf> is moved after the encoded value. If an error + * occurred, it returns -1. + * + * If the value is too big to be encoded, depending on its type, either the + * encoding fails or the value is partially encoded. Only strings and binaries + * can be partially encoded. */ +static inline int +spoe_encode_data(struct sample *smp, char **buf, char *end) +{ + char *p = *buf; + int ret; + + if (p >= end) + return -1; + + if (smp == NULL) { + *p++ = SPOE_DATA_T_NULL; + goto end; + } + + switch (smp->data.type) { + case SMP_T_BOOL: + *p = SPOE_DATA_T_BOOL; + *p++ |= ((!smp->data.u.sint) ? SPOE_DATA_FL_FALSE : SPOE_DATA_FL_TRUE); + break; + + case SMP_T_SINT: + *p++ = SPOE_DATA_T_INT64; + if (encode_varint(smp->data.u.sint, &p, end) == -1) + return -1; + break; + + case SMP_T_IPV4: + if (p + 5 > end) + return -1; + *p++ = SPOE_DATA_T_IPV4; + memcpy(p, &smp->data.u.ipv4, 4); + p += 4; + break; + + case SMP_T_IPV6: + if (p + 17 > end) + return -1; + *p++ = SPOE_DATA_T_IPV6; + memcpy(p, &smp->data.u.ipv6, 16); + p += 16; + break; + + case SMP_T_STR: + case SMP_T_BIN: { + /* If defined, get length and offset of the sample by reading the sample + * context. ctx.a[0] is the pointer to the length and ctx.a[1] is the + * pointer to the offset. If the offset is greater than 0, it means the + * sample is partially encoded. In this case, we only need to encode the + * remaining part. When the whole sample is encoded, the offset is reset + * to 0, so the caller knows it can try to encode the next sample. */ + struct buffer *chk = &smp->data.u.str; + unsigned int *len = smp->ctx.a[0]; + unsigned int *off = smp->ctx.a[1]; + + if (!*off) { + /* First evaluation of the sample: encode the + * type (string or binary), the buffer length + * (as a varint) and at least 1 byte of the + * buffer. */ + *p++ = (smp->data.type == SMP_T_STR) + ? 
SPOE_DATA_T_STR + : SPOE_DATA_T_BIN; + ret = spoe_encode_frag_buffer(chk->area, + chk->data, &p, + end); + if (ret == -1) + return -1; + *len = chk->data; + } + else { + /* The sample has been fragmented, encode remaining data */ + ret = MIN(*len - *off, end - p); + memcpy(p, chk->area + *off, ret); + p += ret; + } + /* Now update <*off> */ + if (ret + *off != *len) + *off += ret; + else + *off = 0; + break; + } + + case SMP_T_METH: { + char *m; + size_t len; + + *p++ = SPOE_DATA_T_STR; + switch (smp->data.u.meth.meth) { + case HTTP_METH_OPTIONS: m = "OPTIONS"; len = 7; break; + case HTTP_METH_GET : m = "GET"; len = 3; break; + case HTTP_METH_HEAD : m = "HEAD"; len = 4; break; + case HTTP_METH_POST : m = "POST"; len = 4; break; + case HTTP_METH_PUT : m = "PUT"; len = 3; break; + case HTTP_METH_DELETE : m = "DELETE"; len = 6; break; + case HTTP_METH_TRACE : m = "TRACE"; len = 5; break; + case HTTP_METH_CONNECT: m = "CONNECT"; len = 7; break; + + default : + m = smp->data.u.meth.str.area; + len = smp->data.u.meth.str.data; + } + if (spoe_encode_buffer(m, len, &p, end) == -1) + return -1; + break; + } + + default: + *p++ = SPOE_DATA_T_NULL; + break; + } + + end: + ret = (p - *buf); + *buf = p; + return ret; +} + +/* Skip a typed data. If an error occurred, -1 is returned, otherwise the number + * of skipped bytes is returned and <*buf> is moved past the skipped data. + * + * A typed data is composed of a type (1 byte) and the corresponding data: + * - boolean: no additional data (0 bytes) + * - integers: a variable-length integer (see decode_varint) + * - ipv4: 4 bytes + * - ipv6: 16 bytes + * - binary and string: a buffer prefixed by its size, a variable-length + * integer (see spoe_decode_buffer) */ +static inline int +spoe_skip_data(char **buf, char *end) +{ + char *str, *p = *buf; + int type, ret; + uint64_t v, sz; + + if (p >= end) + return -1; + + type = *p++; + switch (type & SPOE_DATA_T_MASK) { + case SPOE_DATA_T_BOOL: + break; + case SPOE_DATA_T_INT32: + case SPOE_DATA_T_INT64: + case SPOE_DATA_T_UINT32: + case SPOE_DATA_T_UINT64: + if (decode_varint(&p, end, &v) == -1) + return -1; + break; + case SPOE_DATA_T_IPV4: + if (p+4 > end) + return -1; + p += 4; + break; + case SPOE_DATA_T_IPV6: + if (p+16 > end) + return -1; + p += 16; + break; + case SPOE_DATA_T_STR: + case SPOE_DATA_T_BIN: + /* All the buffer must be skipped */ + if (spoe_decode_buffer(&p, end, &str, &sz) == -1) + return -1; + break; + } + + ret = (p - *buf); + *buf = p; + return ret; +} + +/* Decode a typed data and fill <smp>. If an error occurred, -1 is returned, + * otherwise the number of read bytes is returned and <*buf> is moved after the + * decoded data. See spoe_skip_data for details. 
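+ * For example (illustrative): a single input byte 0x11 combines + * SPOE_DATA_T_BOOL (0x01) with SPOE_DATA_FL_TRUE (0x10), so <smp> is filled + * with a boolean set to 1 and the function returns 1 (one byte consumed). 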
*/ +static inline int +spoe_decode_data(char **buf, char *end, struct sample *smp) +{ + char *str, *p = *buf; + int type, r = 0; + uint64_t sz; + + if (p >= end) + return -1; + + type = *p++; + switch (type & SPOE_DATA_T_MASK) { + case SPOE_DATA_T_BOOL: + smp->data.u.sint = ((type & SPOE_DATA_FL_MASK) == SPOE_DATA_FL_TRUE); + smp->data.type = SMP_T_BOOL; + break; + case SPOE_DATA_T_INT32: + case SPOE_DATA_T_INT64: + case SPOE_DATA_T_UINT32: + case SPOE_DATA_T_UINT64: + if (decode_varint(&p, end, (uint64_t *)&smp->data.u.sint) == -1) + return -1; + smp->data.type = SMP_T_SINT; + break; + case SPOE_DATA_T_IPV4: + if (p+4 > end) + return -1; + smp->data.type = SMP_T_IPV4; + memcpy(&smp->data.u.ipv4, p, 4); + p += 4; + break; + case SPOE_DATA_T_IPV6: + if (p+16 > end) + return -1; + memcpy(&smp->data.u.ipv6, p, 16); + smp->data.type = SMP_T_IPV6; + p += 16; + break; + case SPOE_DATA_T_STR: + case SPOE_DATA_T_BIN: + /* All the buffer must be decoded */ + if (spoe_decode_buffer(&p, end, &str, &sz) == -1) + return -1; + smp->data.u.str.area = str; + smp->data.u.str.data = sz; + smp->data.type = (type == SPOE_DATA_T_STR) ? SMP_T_STR : SMP_T_BIN; + break; + } + + r = (p - *buf); + *buf = p; + return r; +} + +#endif /* _HAPROXY_SPOE_H */ diff --git a/include/haproxy/ssl_ckch-t.h b/include/haproxy/ssl_ckch-t.h new file mode 100644 index 0000000..0002b84 --- /dev/null +++ b/include/haproxy/ssl_ckch-t.h @@ -0,0 +1,161 @@ +/* + * include/haproxy/ssl_ckch-t.h + * ckch structures + * + * Copyright (C) 2020 HAProxy Technologies, William Lallemand <wlallemand@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + + +/* The ckch (cert key and chain) structures are a group of structures used to + * cache and manipulate the certificate files loaded from the configuration + * file and the CLI. Every certificate change made in an SSL_CTX should be done + * in these structures before being applied to the SSL_CTX. + * + * The complete architecture is described in doc/internals/ssl_cert.dia + */ + + +#ifndef _HAPROXY_SSL_CKCH_T_H +#define _HAPROXY_SSL_CKCH_T_H +#ifdef USE_OPENSSL + +#include <import/ebtree-t.h> +#include <haproxy/buf-t.h> +#include <haproxy/openssl-compat.h> + +/* This is used to preload the certificate, private key + * and cert chain of a file passed in via the crt + * argument + * + * This way, we do not have to read the file multiple times + * + * This structure is the base one, in the case of a multi-cert bundle, we + * allocate 1 structure per type. + */ +struct ckch_data { + X509 *cert; + EVP_PKEY *key; + STACK_OF(X509) *chain; + HASSL_DH *dh; + struct buffer *sctl; + struct buffer *ocsp_response; + X509 *ocsp_issuer; + OCSP_CERTID *ocsp_cid; + int ocsp_update_mode; +}; + +/* + * this is used to store 1 to SSL_SOCK_NUM_KEYTYPES cert_key_and_chain and + * metadata. + * + * "ckch" for cert, key and chain. 
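+ * + * Example (illustrative): a "crt site.pem" line in a bind configuration + * would produce one ckch_store keyed by that path, whose <data> holds the + * parsed key, certificate and chain; bind lines reusing the same path are + * expected to share that single store rather than re-reading the file. 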
+ * + * XXX: Once we remove the multi-cert bundle support, we could merge this structure + * with the cert_key_and_chain one. + */ +struct ckch_store { + struct ckch_data *data; + struct list ckch_inst; /* list of ckch_inst which use this ckch_store */ + struct list crtlist_entry; /* list of entries which use this store */ + struct ebmb_node node; + char path[VAR_ARRAY]; +}; + +/* forward declarations for ckch_inst */ +struct ssl_bind_conf; +struct crtlist_entry; + + +/* Used to keep a list of all the instances using a specific cafile_entry. + * It makes it possible to link instances regardless of how they use the CA + * file (either via the ca-file, ca-verify-file or crl-file option). */ +struct ckch_inst_link { + struct ckch_inst *ckch_inst; + struct list list; +}; + +/* Used to keep in a ckch instance a list of all the ckch_inst_link which + * reference it. This way, when deleting a ckch_inst, we can ensure that no + * dangling reference to it will remain. */ +struct ckch_inst_link_ref { + struct ckch_inst_link *link; + struct list list; +}; + +/* + * This structure describes a ckch instance. An instance is generated for each + * bind_conf. The instance contains a linked list of the sni ctx which use + * the ckch in this bind_conf. + */ +struct ckch_inst { + struct bind_conf *bind_conf; /* pointer to the bind_conf that uses this ckch_inst */ + struct ssl_bind_conf *ssl_conf; /* pointer to the ssl_conf which is used by every sni_ctx of this inst */ + struct ckch_store *ckch_store; /* pointer to the store used to generate this inst */ + struct crtlist_entry *crtlist_entry; /* pointer to the crtlist_entry used, or NULL */ + struct server *server; /* pointer to the server if is_server_instance is set, NULL otherwise */ + SSL_CTX *ctx; /* pointer to the SSL context used by this instance */ + unsigned int is_default:1; /* This instance is used as the default ctx for this bind_conf */ + unsigned int is_server_instance:1; /* This instance is used by a backend server */ + /* space for more flags here */ + struct list sni_ctx; /* list of sni_ctx using this ckch_inst */ + struct list by_ckchs; /* chained in ckch_store's list of ckch_inst */ + struct list by_crtlist_entry; /* chained in crtlist_entry list of inst */ + struct list cafile_link_refs; /* list of ckch_inst_link pointing to this instance */ +}; + + +/* Option through which a cafile_entry was created, either + * ca-file/ca-verify-file or crl-file. 
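+ * For instance, an entry created while parsing "ca-file" or + * "ca-verify-file" is expected to be of type CAFILE_CERT, while one created + * for "crl-file" is of type CAFILE_CRL (mapping inferred from the names). 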
*/ +enum cafile_type { + CAFILE_CERT, + CAFILE_CRL +}; + +/* + * deduplicate cafile (and crlfile) + */ +struct cafile_entry { + X509_STORE *ca_store; + STACK_OF(X509_NAME) *ca_list; + struct list ckch_inst_link; /* list of ckch_inst which use this CA file entry */ + enum cafile_type type; + struct ebmb_node node; + char path[0]; +}; + +enum { + CERT_TYPE_PEM = 0, + CERT_TYPE_KEY, +#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) || defined OPENSSL_IS_BORINGSSL) + CERT_TYPE_OCSP, +#endif + CERT_TYPE_ISSUER, +#ifdef HAVE_SSL_SCTL + CERT_TYPE_SCTL, +#endif + CERT_TYPE_MAX, +}; + +struct cert_exts { + const char *ext; + int type; + int (*load)(const char *path, char *payload, struct ckch_data *data, char **err); + /* add a parsing callback */ +}; + +#endif /* USE_OPENSSL */ +#endif /* _HAPROXY_SSL_CKCH_T_H */ diff --git a/include/haproxy/ssl_ckch.h b/include/haproxy/ssl_ckch.h new file mode 100644 index 0000000..64ac3df --- /dev/null +++ b/include/haproxy/ssl_ckch.h @@ -0,0 +1,75 @@ +/* + * include/haproxy/ssl_ckch.h + * ckch function prototypes + * + * Copyright (C) 2020 HAProxy Technologies, William Lallemand <wlallemand@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SSL_CKCH_H +#define _HAPROXY_SSL_CKCH_H +#ifdef USE_OPENSSL + +#include <haproxy/ssl_ckch-t.h> + +/* cert_key_and_chain functions */ + +int ssl_sock_load_files_into_ckch(const char *path, struct ckch_data *data, char **err); +int ssl_sock_load_pem_into_ckch(const char *path, char *buf, struct ckch_data *datackch, char **err); +void ssl_sock_free_cert_key_and_chain_contents(struct ckch_data *data); + +int ssl_sock_load_key_into_ckch(const char *path, char *buf, struct ckch_data *data, char **err); +int ssl_sock_load_ocsp_response_from_file(const char *ocsp_path, char *buf, struct ckch_data *data, char **err); +int ssl_sock_load_sctl_from_file(const char *sctl_path, char *buf, struct ckch_data *data, char **err); +int ssl_sock_load_issuer_file_into_ckch(const char *path, char *buf, struct ckch_data *data, char **err); + +/* ckch_store functions */ +struct ckch_store *ckchs_load_cert_file(char *path, char **err); +struct ckch_store *ckchs_lookup(char *path); +struct ckch_store *ckchs_dup(const struct ckch_store *src); +struct ckch_store *ckch_store_new(const char *filename); +void ckch_store_free(struct ckch_store *store); +void ckch_store_replace(struct ckch_store *old_ckchs, struct ckch_store *new_ckchs); + +/* ckch_inst functions */ +void ckch_inst_free(struct ckch_inst *inst); +struct ckch_inst *ckch_inst_new(); +int ckch_inst_new_load_store(const char *path, struct ckch_store *ckchs, struct bind_conf *bind_conf, + struct ssl_bind_conf *ssl_conf, char **sni_filter, int fcount, struct ckch_inst **ckchi, char **err); +int ckch_inst_new_load_srv_store(const char *path, struct ckch_store *ckchs, + struct ckch_inst 
**ckchi, char **err); +int ckch_inst_rebuild(struct ckch_store *ckch_store, struct ckch_inst *ckchi, + struct ckch_inst **new_inst, char **err); + +void ckch_deinit(); +void ckch_inst_add_cafile_link(struct ckch_inst *ckch_inst, struct bind_conf *bind_conf, + struct ssl_bind_conf *ssl_conf, const struct server *srv); + +/* ssl_store functions */ +struct cafile_entry *ssl_store_get_cafile_entry(char *path, int oldest_entry); +X509_STORE* ssl_store_get0_locations_file(char *path); +int ssl_store_add_uncommitted_cafile_entry(struct cafile_entry *entry); +struct cafile_entry *ssl_store_create_cafile_entry(char *path, X509_STORE *store, enum cafile_type type); +struct cafile_entry *ssl_store_dup_cafile_entry(struct cafile_entry *src); +void ssl_store_delete_cafile_entry(struct cafile_entry *ca_e); +int ssl_store_load_ca_from_buf(struct cafile_entry *ca_e, char *cert_buf, int append); +int ssl_store_load_locations_file(char *path, int create_if_none, enum cafile_type type); +int __ssl_store_load_locations_file(char *path, int create_if_none, enum cafile_type type, int shuterror); + +extern struct cert_exts cert_exts[]; + +#endif /* USE_OPENSSL */ +#endif /* _HAPROXY_SSL_CKCH_H */ diff --git a/include/haproxy/ssl_crtlist-t.h b/include/haproxy/ssl_crtlist-t.h new file mode 100644 index 0000000..dc7a376 --- /dev/null +++ b/include/haproxy/ssl_crtlist-t.h @@ -0,0 +1,63 @@ +/* + * include/haproxy/ssl_crtlist-t.h + * crt-list structures + * + * Copyright (C) 2020 HAProxy Technologies, William Lallemand <wlallemand@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SSL_CRTLIST_T_H +#define _HAPROXY_SSL_CRTLIST_T_H +#ifdef USE_OPENSSL + +#include <import/ebtree-t.h> + + +/* forward declarations for structures below */ +struct bind_conf; +struct ssl_bind_conf; +struct proxy; + +/* list of bind conf used by struct crtlist */ +struct bind_conf_list { + struct bind_conf *bind_conf; + struct bind_conf_list *next; +}; + +/* This structure is basically a crt-list or a directory */ +struct crtlist { + struct bind_conf_list *bind_conf; /* list of bind_conf which use this crtlist */ + unsigned int linecount; /* number of lines */ + struct eb_root entries; + struct list ord_entries; /* list to keep the line order of the crt-list file */ + struct ebmb_node node; /* key is the filename or directory */ +}; + +/* a file in a directory or a line in a crt-list */ +struct crtlist_entry { + struct ssl_bind_conf *ssl_conf; /* SSL conf in crt-list */ + unsigned int linenum; + unsigned int fcount; /* filters count */ + char **filters; + struct crtlist *crtlist; /* ptr to the parent crtlist */ + struct list ckch_inst; /* list of instances of this entry, there is 1 ckch_inst per instance of the crt-list */ + struct list by_crtlist; /* ordered entries */ + struct list by_ckch_store; /* linked in ckch_store list of crtlist_entries */ + struct ebpt_node node; /* key is a ptr to a ckch_store */ +}; + +#endif /* USE_OPENSSL */ +#endif /* _HAPROXY_SSL_CRTLIST_T_H */ diff --git a/include/haproxy/ssl_crtlist.h b/include/haproxy/ssl_crtlist.h new file mode 100644 index 0000000..961cfc3 --- /dev/null +++ b/include/haproxy/ssl_crtlist.h @@ -0,0 +1,48 @@ +/* + * include/haproxy/ssl_crtlist.h + * crt-list function prototypes + * + * Copyright (C) 2020 HAProxy Technologies, William Lallemand <wlallemand@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SSL_CRTLIST_H +#define _HAPROXY_SSL_CRTLIST_H +#ifdef USE_OPENSSL + +#include <haproxy/ssl_crtlist-t.h> + + +/* crt-list entry functions */ +void ssl_sock_free_ssl_conf(struct ssl_bind_conf *conf); +char **crtlist_dup_filters(char **args, int fcount); +void crtlist_free_filters(char **args); +void crtlist_entry_free(struct crtlist_entry *entry); +struct crtlist_entry *crtlist_entry_new(); + +/* crt-list functions */ +void crtlist_free(struct crtlist *crtlist); +struct crtlist *crtlist_new(const char *filename, int unique); + +/* file loading */ +int crtlist_parse_line(char *line, char **crt_path, struct crtlist_entry *entry, const char *file, int linenum, int from_cli, char **err); +int crtlist_parse_file(char *file, struct bind_conf *bind_conf, struct proxy *curproxy, struct crtlist **crtlist, char **err); +int crtlist_load_cert_dir(char *path, struct bind_conf *bind_conf, struct crtlist **crtlist, char **err); + +void crtlist_deinit(); + +#endif /* USE_OPENSSL */ +#endif /* _HAPROXY_SSL_CRTLIST_H */ diff --git a/include/haproxy/ssl_ocsp-t.h b/include/haproxy/ssl_ocsp-t.h new file mode 100644 index 0000000..fc2750b --- /dev/null +++ b/include/haproxy/ssl_ocsp-t.h @@ -0,0 +1,94 @@ +/* + * include/haproxy/ssl_ocsp-t.h + * SSL structures related to OCSP + * + * Copyright (C) 2022 Remi Tricot-Le Breton - rlebreton@haproxy.com + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SSL_OCSP_T_H +#define _HAPROXY_SSL_OCSP_T_H +#ifdef USE_OPENSSL + +#include <import/ebtree-t.h> + +#include <haproxy/buf-t.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/ssl_sock-t.h> + +#ifndef OPENSSL_NO_OCSP +extern int ocsp_ex_index; +#endif + +#define SSL_OCSP_UPDATE_DELAY_MAX 60*60 /* 1H */ +#define SSL_OCSP_UPDATE_DELAY_MIN 5*60 /* 5 minutes */ +#define SSL_OCSP_UPDATE_MARGIN 60 /* 1 minute */ +#define SSL_OCSP_HTTP_ERR_REPLAY 60 /* 1 minute */ + +#if (defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) +/* + * struct alignment works here such that the key.key is the same as key_data. + * Do not change the placement of key_data + */ +struct certificate_ocsp { + struct ebmb_node key; + unsigned char key_data[OCSP_MAX_CERTID_ASN1_LENGTH]; + unsigned int key_length; + int refcount_store; /* Number of ckch_store that reference this certificate_ocsp */ + int refcount_instance; /* Number of ckch_inst that reference this certificate_ocsp */ + struct buffer response; + long expire; + X509 *issuer; + STACK_OF(X509) *chain; + struct eb64_node next_update; /* Key of items inserted in ocsp_update_tree (sorted by absolute date) */ + struct buffer *uri; /* First OCSP URI contained in the corresponding certificate */ + + /* OCSP update stats */ + u64 last_update; /* Time of last successful update */ + unsigned int last_update_status;/* Status of the last OCSP update */ + unsigned int num_success; /* Number of successful updates */ + unsigned int num_failure; /* Number of failed updates */ + unsigned int fail_count:30; /* Number of successive failures */ + unsigned int update_once:1; /* Set if an entry should not be reinserted into the tree after update */ + unsigned int updating:1; /* Set if an entry is already being updated */ + char path[VAR_ARRAY]; +}; + +struct ocsp_cbk_arg { + int is_single; + int single_kt; + union { + struct certificate_ocsp *s_ocsp; + /* + * m_ocsp will have multiple entries dependent on key type + * Entry 0 - DSA + * Entry 1 - ECDSA + * Entry 2 - RSA + */ + struct certificate_ocsp *m_ocsp[SSL_SOCK_NUM_KEYTYPES]; + }; +}; + +extern struct eb_root cert_ocsp_tree; +extern struct eb_root ocsp_update_tree; +extern struct task *ocsp_update_task; + +__decl_thread(extern HA_SPINLOCK_T ocsp_tree_lock); + +#endif /* (defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) */ + +#endif /* USE_OPENSSL */ +#endif /* _HAPROXY_SSL_OCSP_T_H */ diff --git a/include/haproxy/ssl_ocsp.h b/include/haproxy/ssl_ocsp.h new file mode 100644 index 0000000..8a4197c --- /dev/null +++ b/include/haproxy/ssl_ocsp.h @@ -0,0 +1,70 @@ +/* + * include/haproxy/ssl_ocsp.h + * This file contains definition for ssl OCSP operations + * + * Copyright (C) 2022 Remi Tricot-Le Breton - rlebreton@haproxy.com + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SSL_OCSP_H +#define _HAPROXY_SSL_OCSP_H +#ifdef USE_OPENSSL + +#include <haproxy/openssl-compat.h> +#include <haproxy/ssl_ckch-t.h> +#include <haproxy/ssl_crtlist-t.h> +#include <haproxy/ssl_ocsp-t.h> + +#if (defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) + +int ssl_ocsp_build_response_key(OCSP_CERTID *ocsp_cid, unsigned char certid[OCSP_MAX_CERTID_ASN1_LENGTH], unsigned int *key_length); + +int ssl_sock_get_ocsp_arg_kt_index(int evp_keytype); +int ssl_sock_ocsp_stapling_cbk(SSL *ssl, void *arg); + +void ssl_sock_free_ocsp(struct certificate_ocsp *ocsp); +void ssl_sock_free_ocsp_instance(struct certificate_ocsp *ocsp); + +int ssl_sock_load_ocsp_response(struct buffer *ocsp_response, + struct certificate_ocsp *ocsp, + OCSP_CERTID *cid, char **err); +int ssl_sock_update_ocsp_response(struct buffer *ocsp_response, char **err); +void ssl_sock_ocsp_free_func(void *parent, void *ptr, CRYPTO_EX_DATA *ad, int idx, long argl, void *argp); + +int ssl_ocsp_get_uri_from_cert(X509 *cert, struct buffer *out, char **err); +int ssl_ocsp_create_request_details(const OCSP_CERTID *certid, struct buffer *req_url, + struct buffer *req_body, char **err); +int ssl_ocsp_check_response(STACK_OF(X509) *chain, X509 *issuer, + struct buffer *respbuf, char **err); + +int ssl_create_ocsp_update_task(char **err); +void ssl_destroy_ocsp_update_task(void); + +int ssl_ocsp_update_insert(struct certificate_ocsp *ocsp); + +int ocsp_update_check_cfg_consistency(struct ckch_store *store, struct crtlist_entry *entry, char *crt_path, char **err); + +#endif /* (defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) */ + +#endif /* USE_OPENSSL */ +#endif /* _HAPROXY_SSL_OCSP_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/ssl_sock-t.h b/include/haproxy/ssl_sock-t.h new file mode 100644 index 0000000..fdf41a7 --- /dev/null +++ b/include/haproxy/ssl_sock-t.h @@ -0,0 +1,323 @@ +/* + * include/haproxy/ssl_sock-t.h + * SSL settings for listeners and servers + * + * Copyright (C) 2012 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SSL_SOCK_T_H +#define _HAPROXY_SSL_SOCK_T_H +#ifdef USE_OPENSSL + +#include <import/ebtree-t.h> + +#include <haproxy/buf-t.h> +#include <haproxy/connection-t.h> /* struct wait_event */ +#include <haproxy/listener-t.h> +#include <haproxy/openssl-compat.h> +#include <haproxy/ssl_ckch-t.h> +#include <haproxy/ssl_crtlist-t.h> +#include <haproxy/thread-t.h> + +/* ***** READ THIS before adding code here! 
***** + * + * Due to API incompatibilities between multiple OpenSSL versions and their + * derivatives, it's often tempting to add macros to (re-)define certain + * symbols. Please do not do this here, and do it in common/openssl-compat.h + * exclusively so that the whole code consistently uses the same macros. + * + * Whenever possible if a macro is missing in certain versions, it's better + * to conditionally define it in openssl-compat.h than to use lots of ifdefs. + */ + +/* Warning, these are bits, not integers! */ +#define SSL_SOCK_ST_FL_VERIFY_DONE 0x00000001 +#define SSL_SOCK_ST_FL_16K_WBFSIZE 0x00000002 +#define SSL_SOCK_SEND_UNLIMITED 0x00000004 +#define SSL_SOCK_RECV_HEARTBEAT 0x00000008 +#define SSL_SOCK_SEND_MORE 0x00000010 /* set MSG_MORE at lower levels */ + +/* bits 0xFFFFFF00 are reserved to store verify errors. + * The CA and CRT error codes will be stored on 7 bits each + * (since the max verify error code does not exceed 127) + * and the CA error depth will be stored on 4 bits. + */ + +/* Verify errors macros */ +#define SSL_SOCK_CA_ERROR_TO_ST(e) (((e > 127) ? 127 : e) << (8)) +#define SSL_SOCK_CAEDEPTH_TO_ST(d) (((d > 15) ? 15 : d) << (7+8)) +#define SSL_SOCK_CRTERROR_TO_ST(e) (((e > 127) ? 127 : e) << (4+7+8)) + +#define SSL_SOCK_ST_TO_CA_ERROR(s) ((s >> (8)) & 127) +#define SSL_SOCK_ST_TO_CAEDEPTH(s) ((s >> (7+8)) & 15) +#define SSL_SOCK_ST_TO_CRTERROR(s) ((s >> (4+7+8)) & 127) + +/* ssl_methods flags for ssl options */ +#define MC_SSL_O_ALL 0x0000 +#define MC_SSL_O_NO_SSLV3 0x0001 /* disable SSLv3 */ +#define MC_SSL_O_NO_TLSV10 0x0002 /* disable TLSv10 */ +#define MC_SSL_O_NO_TLSV11 0x0004 /* disable TLSv11 */ +#define MC_SSL_O_NO_TLSV12 0x0008 /* disable TLSv12 */ +#define MC_SSL_O_NO_TLSV13 0x0010 /* disable TLSv13 */ + +/* files to guess during file loading */ +#define SSL_GF_NONE 0x00000000 /* Don't guess any file, only open the files specified in the configuration files */ +#define SSL_GF_BUNDLE 0x00000001 /* try to open the bundles */ +#define SSL_GF_SCTL 0x00000002 /* try to open the .sctl file */ +#define SSL_GF_OCSP 0x00000004 /* try to open the .ocsp file */ +#define SSL_GF_OCSP_ISSUER 0x00000008 /* try to open the .issuer file if an OCSP file was loaded */ +#define SSL_GF_KEY 0x00000010 /* try to open the .key file to load a private key */ + +#define SSL_GF_ALL (SSL_GF_BUNDLE|SSL_GF_SCTL|SSL_GF_OCSP|SSL_GF_OCSP_ISSUER|SSL_GF_KEY) + +/* ssl_methods versions */ +enum { + CONF_TLSV_NONE = 0, + CONF_TLSV_MIN = 1, + CONF_SSLV3 = 1, + CONF_TLSV10 = 2, + CONF_TLSV11 = 3, + CONF_TLSV12 = 4, + CONF_TLSV13 = 5, + CONF_TLSV_MAX = 5, +}; + +/* server and bind verify method, it uses a global value as default */ +enum { + SSL_SOCK_VERIFY_DEFAULT = 0, + SSL_SOCK_VERIFY_REQUIRED = 1, + SSL_SOCK_VERIFY_OPTIONAL = 2, + SSL_SOCK_VERIFY_NONE = 3, +}; + +/* bind ocsp update mode */ +enum { + SSL_SOCK_OCSP_UPDATE_DFLT = 0, + SSL_SOCK_OCSP_UPDATE_OFF = 1, + SSL_SOCK_OCSP_UPDATE_ON = 2, +}; + +/* states of the CLI IO handler for 'set ssl cert' */ +enum { + SETCERT_ST_INIT = 0, + SETCERT_ST_GEN, + SETCERT_ST_INSERT, + SETCERT_ST_FIN, +}; + +#if (HA_OPENSSL_VERSION_NUMBER < 0x1010000fL) +typedef enum { SET_CLIENT, SET_SERVER } set_context_func; +#else /* openssl >= 1.1.0 */ +typedef enum { SET_MIN, SET_MAX } set_context_func; +#endif + +struct methodVersions { + int option; + uint16_t flag; + void (*ctx_set_version)(SSL_CTX *, set_context_func); + void (*ssl_set_version)(SSL *, set_context_func); + const char *name; +}; + +struct pkey_info { + uint8_t sig; /* 
TLSEXT_signature_[rsa,ecdsa,...] */ + uint16_t bits; /* key size in bits */ +}; + +struct sni_ctx { + SSL_CTX *ctx; /* context associated to the certificate */ + int order; /* load order for the certificate */ + unsigned int neg:1; /* reject if match */ + unsigned int wild:1; /* wildcard sni */ + struct pkey_info kinfo; /* pkey info */ + struct ssl_bind_conf *conf; /* ptr to a crtlist's ssl_conf, must not be freed from here */ + struct list by_ckch_inst; /* chained in ckch_inst's list of sni_ctx */ + struct ckch_inst *ckch_inst; /* instance used to create this sni_ctx */ + struct ebmb_node name; /* node holding the servername value */ +}; + +struct tls_sess_key_128 { + unsigned char name[16]; + unsigned char aes_key[16]; + unsigned char hmac_key[16]; +} __attribute__((packed)); + +struct tls_sess_key_256 { + unsigned char name[16]; + unsigned char aes_key[32]; + unsigned char hmac_key[32]; +} __attribute__((packed)); + +union tls_sess_key{ + unsigned char name[16]; + struct tls_sess_key_128 key_128; + struct tls_sess_key_256 key_256; +} __attribute__((packed)); + +struct tls_keys_ref { + struct list list; /* Used to chain refs. */ + char *filename; + int unique_id; /* Each pattern reference has a unique id. */ + int refcount; /* number of users of this tls_keys_ref. */ + union tls_sess_key *tlskeys; + int tls_ticket_enc_index; + int key_size_bits; + __decl_thread(HA_RWLOCK_T lock); /* lock used to protect the ref */ +}; + +/* shared ssl session */ +struct sh_ssl_sess_hdr { + struct ebmb_node key; + unsigned char key_data[SSL_MAX_SSL_SESSION_ID_LENGTH]; +}; + +/* issuer chain store with hash of Subject Key Identifier. + * certificate/issuer matching is verified with X509_check_issued + */ +struct issuer_chain { + struct eb64_node node; + STACK_OF(X509) *chain; + char *path; +}; + +struct connection; + +typedef void (*ssl_sock_msg_callback_func)(struct connection *conn, + int write_p, int version, int content_type, + const void *buf, size_t len, SSL *ssl); + +/* This structure contains a function pointer <func> that is called + * when observing received or sent SSL/TLS protocol messages, such as + * handshake messages or other events that can occur during processing. + */ +struct ssl_sock_msg_callback { + ssl_sock_msg_callback_func func; + struct list list; /* list of registered callbacks */ +}; + +/* This memory pool is used for capturing clienthello parameters. 
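+ * Each capture presumably records a xxh64 hash of the captured material plus + * the offsets and lengths of the ciphersuites, extensions and elliptic-curve + * fields within <data> (roles inferred from the field names below). 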
*/ +struct ssl_capture { + ullong xxh64; + ushort protocol_version; + ushort ciphersuite_len; + ushort extensions_len; + ushort ec_len; + uint ciphersuite_offset; + uint extensions_offset; + uint ec_offset; + uint ec_formats_offset; + uchar ec_formats_len; + char data[VAR_ARRAY]; +}; + +#ifdef HAVE_SSL_KEYLOG +#define SSL_KEYLOG_MAX_SECRET_SIZE 129 + +struct ssl_keylog { + /* + * https://developer.mozilla.org/en-US/docs/Mozilla/Projects/NSS/Key_Log_Format + */ + char *client_random; + + /* TLS 1.3 */ + char *client_early_traffic_secret; + char *client_handshake_traffic_secret; + char *server_handshake_traffic_secret; + char *client_traffic_secret_0; + char *server_traffic_secret_0; + char *exporter_secret; + char *early_exporter_secret; +}; +#endif + +struct ssl_sock_ctx { + struct connection *conn; + SSL *ssl; + BIO *bio; + const struct xprt_ops *xprt; + void *xprt_ctx; + struct wait_event wait_event; + struct wait_event *subs; + int xprt_st; /* transport layer state, initialized to zero */ + unsigned long error_code; /* last error code of the error stack */ + struct buffer early_buf; /* buffer to store the early data received */ + int sent_early_data; /* Amount of early data we sent so far */ + +#ifdef USE_QUIC + struct quic_conn *qc; +#endif +}; + +struct global_ssl { + char *crt_base; /* base directory path for certificates */ + char *ca_base; /* base directory path for CAs and CRLs */ + char *issuers_chain_path; /* from "issuers-chain-path" */ + int skip_self_issued_ca; + + int async; /* whether we use ssl async mode */ + + char *listen_default_ciphers; + char *connect_default_ciphers; +#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES + char *listen_default_ciphersuites; + char *connect_default_ciphersuites; +#endif +#if defined(SSL_CTX_set1_curves_list) + char *listen_default_curves; + char *connect_default_curves; +#endif +#if defined(SSL_CTX_set1_sigalgs_list) + char *listen_default_sigalgs; + char *connect_default_sigalgs; +#endif +#if defined(SSL_CTX_set1_sigalgs_list) + char *listen_default_client_sigalgs; + char *connect_default_client_sigalgs; +#endif + int listen_default_ssloptions; + int connect_default_ssloptions; + struct tls_version_filter listen_default_sslmethods; + struct tls_version_filter connect_default_sslmethods; + + int private_cache; /* Force to use a private session cache even if nbproc > 1 */ + unsigned int life_time; /* SSL session lifetime in seconds */ + unsigned int max_record; /* SSL max record size */ + unsigned int hard_max_record; /* SSL max record size hard limit */ + unsigned int default_dh_param; /* SSL maximum DH parameter size */ + int ctx_cache; /* max number of entries in the ssl_ctx cache. */ + int capture_buffer_size; /* Size of the capture buffer. 
+struct ssl_sock_ctx {
+	struct connection *conn;
+	SSL *ssl;
+	BIO *bio;
+	const struct xprt_ops *xprt;
+	void *xprt_ctx;
+	struct wait_event wait_event;
+	struct wait_event *subs;
+	int xprt_st;                  /* transport layer state, initialized to zero */
+	unsigned long error_code;     /* last error code of the error stack */
+	struct buffer early_buf;      /* buffer to store the early data received */
+	int sent_early_data;          /* Amount of early data we sent so far */
+
+#ifdef USE_QUIC
+	struct quic_conn *qc;
+#endif
+};
+
+struct global_ssl {
+	char *crt_base;               /* base directory path for certificates */
+	char *ca_base;                /* base directory path for CAs and CRLs */
+	char *issuers_chain_path;     /* from "issuers-chain-path" */
+	int skip_self_issued_ca;
+
+	int async;                    /* whether we use ssl async mode */
+
+	char *listen_default_ciphers;
+	char *connect_default_ciphers;
+#ifdef HAVE_SSL_CTX_SET_CIPHERSUITES
+	char *listen_default_ciphersuites;
+	char *connect_default_ciphersuites;
+#endif
+#if defined(SSL_CTX_set1_curves_list)
+	char *listen_default_curves;
+	char *connect_default_curves;
+#endif
+#if defined(SSL_CTX_set1_sigalgs_list)
+	char *listen_default_sigalgs;
+	char *connect_default_sigalgs;
+#endif
+#if defined(SSL_CTX_set1_client_sigalgs_list)
+	char *listen_default_client_sigalgs;
+	char *connect_default_client_sigalgs;
+#endif
+	int listen_default_ssloptions;
+	int connect_default_ssloptions;
+	struct tls_version_filter listen_default_sslmethods;
+	struct tls_version_filter connect_default_sslmethods;
+
+	int private_cache;            /* Force to use a private session cache even if nbproc > 1 */
+	unsigned int life_time;       /* SSL session lifetime in seconds */
+	unsigned int max_record;      /* SSL max record size */
+	unsigned int hard_max_record; /* SSL max record size hard limit */
+	unsigned int default_dh_param; /* SSL maximum DH parameter size */
+	int ctx_cache;                /* max number of entries in the ssl_ctx cache. */
+	int capture_buffer_size;      /* Size of the capture buffer. */
+	int keylog;                   /* activate keylog */
+	int extra_files;              /* which files not defined in the configuration file are we looking for */
+	int extra_files_noext;        /* whether we remove the extension when looking up an extra file */
+
+#ifndef OPENSSL_NO_OCSP
+	struct {
+		unsigned int delay_max;
+		unsigned int delay_min;
+	} ocsp_update;
+#endif
+};
+
+/* The order here matters for picking a default context,
+ * keep the most common keytype at the bottom of the list
+ */
+extern const char *SSL_SOCK_KEYTYPE_NAMES[];
+
+#define SSL_SOCK_NUM_KEYTYPES 3
+
+#endif /* USE_OPENSSL */
+#endif /* _HAPROXY_SSL_SOCK_T_H */
diff --git a/include/haproxy/ssl_sock.h b/include/haproxy/ssl_sock.h
new file mode 100644
index 0000000..02d5b02
--- /dev/null
+++ b/include/haproxy/ssl_sock.h
@@ -0,0 +1,191 @@
+/*
+ * include/haproxy/ssl_sock.h
+ * This file contains definition for ssl stream socket operations
+ *
+ * Copyright (C) 2012 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_SSL_SOCK_H
+#define _HAPROXY_SSL_SOCK_H
+#ifdef USE_OPENSSL
+
+
+#include <haproxy/connection.h>
+#include <haproxy/openssl-compat.h>
+#include <haproxy/pool-t.h>
+#include <haproxy/proxy-t.h>
+#include <haproxy/ssl_sock-t.h>
+#include <haproxy/thread.h>
+
+extern struct list tlskeys_reference;
+extern struct eb_root ckchs_tree;
+extern struct eb_root crtlists_tree;
+extern struct eb_root cafile_tree;
+extern int sctl_ex_index;
+extern struct global_ssl global_ssl;
+extern struct ssl_crtlist_kw ssl_crtlist_kws[];
+extern struct methodVersions methodVersions[];
+__decl_thread(extern HA_SPINLOCK_T ckch_lock);
+extern struct pool_head *pool_head_ssl_capture;
+extern int ssl_app_data_index;
+#ifdef USE_QUIC
+extern int ssl_qc_app_data_index;
+#endif /* USE_QUIC */
+extern unsigned int openssl_engines_initialized;
+extern int nb_engines;
+extern struct xprt_ops ssl_sock;
+extern int ssl_capture_ptr_index;
+extern int ssl_keylog_index;
+extern int ssl_client_sni_index;
+extern struct pool_head *pool_head_ssl_keylog;
+extern struct pool_head *pool_head_ssl_keylog_str;
+extern struct list openssl_providers;
+
+int ssl_sock_prep_ctx_and_inst(struct bind_conf *bind_conf, struct ssl_bind_conf *ssl_conf,
+                               SSL_CTX *ctx, struct ckch_inst *ckch_inst, char **err);
+int ssl_sock_prep_srv_ctx_and_inst(const struct server *srv, SSL_CTX *ctx,
+                                   struct ckch_inst *ckch_inst);
+int ssl_sock_prepare_all_ctx(struct bind_conf *bind_conf);
+int ssl_sock_prepare_bind_conf(struct bind_conf *bind_conf);
+void ssl_sock_destroy_bind_conf(struct bind_conf *bind_conf);
+int ssl_sock_prepare_srv_ctx(struct server *srv);
+void ssl_sock_free_srv_ctx(struct server *srv);
+void ssl_sock_free_all_ctx(struct bind_conf *bind_conf);
+int ssl_sock_get_alpn(const struct connection *conn, void *xprt_ctx,
+                      const char **str, int *len);
+int ssl_sock_load_ca(struct bind_conf *bind_conf);
+void ssl_sock_free_ca(struct bind_conf *bind_conf);
+int ssl_bio_and_sess_init(struct connection *conn, SSL_CTX *ssl_ctx,
+                          SSL **ssl, BIO **bio, BIO_METHOD *bio_meth, void *ctx);
+const char *ssl_sock_get_sni(struct connection *conn);
+const char *ssl_sock_get_cert_sig(struct connection *conn);
+const char *ssl_sock_get_cipher_name(struct connection *conn);
+const char *ssl_sock_get_proto_version(struct connection *conn);
+int ssl_sock_parse_alpn(char *arg, char **alpn_str, int *alpn_len, char **err);
+void ssl_sock_set_alpn(struct connection *conn, const unsigned char *, int);
+void ssl_sock_set_servername(struct connection *conn, const char *hostname);
+
+int ssl_sock_get_cert_used_sess(struct connection *conn);
+int ssl_sock_get_cert_used_conn(struct connection *conn);
+int ssl_sock_get_remote_common_name(struct connection *conn,
+                                    struct buffer *out);
+int ssl_sock_get_pkey_algo(struct connection *conn, struct buffer *out);
+unsigned int ssl_sock_get_verify_result(struct connection *conn);
+#if (defined SSL_CTRL_SET_TLSEXT_TICKET_KEY_CB && TLS_TICKETS_NO > 0)
+int ssl_sock_update_tlskey_ref(struct tls_keys_ref *ref,
+                               struct buffer *tlskey);
+int ssl_sock_update_tlskey(char *filename, struct buffer *tlskey, char **err);
+struct tls_keys_ref *tlskeys_ref_lookup(const char *filename);
+struct tls_keys_ref *tlskeys_ref_lookupid(int unique_id);
+#endif
+#ifndef OPENSSL_NO_DH
+HASSL_DH *ssl_sock_get_dh_from_bio(BIO *bio);
+int ssl_sock_load_global_dh_param_from_file(const char *filename);
+void ssl_free_dh(void);
+#endif
+void ssl_free_engines(void);
+#ifdef HAVE_SSL_PROVIDERS
+void ssl_unload_providers(void);
+#endif
+
+#ifdef HAVE_SSL_CLIENT_HELLO_CB
+int ssl_sock_switchctx_err_cbk(SSL *ssl, int *al, void *priv);
+# ifdef OPENSSL_IS_BORINGSSL
+int ssl_sock_switchctx_cbk(const struct ssl_early_callback_ctx *ctx);
+# else /* ! OPENSSL_IS_BORINGSSL */
+int ssl_sock_switchctx_cbk(SSL *ssl, int *al, void *arg);
+# endif
+#else /* ! HAVE_SSL_CLIENT_HELLO_CB */
+int ssl_sock_switchctx_cbk(SSL *ssl, int *al, void *priv);
+#endif
+
+int increment_sslconn(void);
+SSL_CTX *ssl_sock_assign_generated_cert(unsigned int key, struct bind_conf *bind_conf, SSL *ssl);
+SSL_CTX *ssl_sock_get_generated_cert(unsigned int key, struct bind_conf *bind_conf);
+int ssl_sock_set_generated_cert(SSL_CTX *ctx, unsigned int key, struct bind_conf *bind_conf);
+unsigned int ssl_sock_generated_cert_key(const void *data, size_t len);
+void ssl_sock_load_cert_sni(struct ckch_inst *ckch_inst, struct bind_conf *bind_conf);
+#ifdef SSL_MODE_ASYNC
+void ssl_async_fd_handler(int fd);
+void ssl_async_fd_free(int fd);
+#endif
+struct issuer_chain* ssl_get0_issuer_chain(X509 *cert);
+int ssl_load_global_issuer_from_BIO(BIO *in, char *fp, char **err);
+int ssl_sock_load_cert(char *path, struct bind_conf *bind_conf, char **err);
+int ssl_sock_load_srv_cert(char *path, struct server *server, int create_if_none, char **err);
+void ssl_free_global_issuers(void);
+int ssl_initialize_random(void);
+int ssl_sock_load_cert_list_file(char *file, int dir, struct bind_conf *bind_conf, struct proxy *curproxy, char **err);
+int ssl_init_single_engine(const char *engine_id, const char *def_algorithms);
+#ifdef HAVE_SSL_PROVIDERS
+int ssl_init_provider(const char *provider_name);
+#endif
+#if ((defined SSL_CTRL_SET_TLSEXT_STATUS_REQ_CB && !defined OPENSSL_NO_OCSP) && !defined OPENSSL_IS_BORINGSSL)
+int ssl_get_ocspresponse_detail(unsigned char *ocsp_certid, struct buffer *out);
+int ssl_ocsp_response_print(struct buffer *ocsp_response, struct buffer *out);
+#endif
+
+/* ssl shctx macro */
+
+#define sh_ssl_sess_tree_delete(s)	ebmb_delete(&(s)->key);
+
+#define sh_ssl_sess_tree_insert(s)	(struct sh_ssl_sess_hdr *)ebmb_insert(sh_ssl_sess_tree, \
+						&(s)->key, SSL_MAX_SSL_SESSION_ID_LENGTH);
+
+#define sh_ssl_sess_tree_lookup(k)	(struct sh_ssl_sess_hdr *)ebmb_lookup(sh_ssl_sess_tree, \
+						(k), SSL_MAX_SSL_SESSION_ID_LENGTH);
+
+/* Registers the function <func> in order to be called on SSL/TLS protocol
+ * message processing.
+ */ +int ssl_sock_register_msg_callback(ssl_sock_msg_callback_func func); + +SSL *ssl_sock_get_ssl_object(struct connection *conn); + +static inline int cert_ignerr_bitfield_get(const unsigned long long *bitfield, int bit_index) +{ + int byte_index = bit_index >> 6; + int val = 0; + + if (byte_index < IGNERR_BF_SIZE) + val = bitfield[byte_index] & (1ULL << (bit_index & 0x3F)); + + return val != 0; +} + +static inline void cert_ignerr_bitfield_set(unsigned long long *bitfield, int bit_index) +{ + int byte_index = bit_index >> 6; + + if (byte_index < IGNERR_BF_SIZE) + bitfield[byte_index] |= (1ULL << (bit_index & 0x3F)); +} + +static inline void cert_ignerr_bitfield_set_all(unsigned long long *bitfield) +{ + memset(bitfield, -1, IGNERR_BF_SIZE*sizeof(*bitfield)); +} + +#endif /* USE_OPENSSL */ +#endif /* _HAPROXY_SSL_SOCK_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/ssl_utils.h b/include/haproxy/ssl_utils.h new file mode 100644 index 0000000..3391efd --- /dev/null +++ b/include/haproxy/ssl_utils.h @@ -0,0 +1,51 @@ +/* + * include/haproxy/ssl_utils.h + * + * Utility functions for SSL: + * Mostly generic functions that retrieve information from certificates + * + * Copyright (C) 2012 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr> + * Copyright (C) 2020 HAProxy Technologies, William Lallemand <wlallemand@haproxy.com> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_SSL_UTILS_H +#define _HAPROXY_SSL_UTILS_H + +#ifdef USE_OPENSSL + +#include <haproxy/buf-t.h> +#include <haproxy/openssl-compat.h> + +int cert_get_pkey_algo(X509 *crt, struct buffer *out); +int ssl_sock_get_serial(X509 *crt, struct buffer *out); +int ssl_sock_crt2der(X509 *crt, struct buffer *out); +int ssl_sock_get_time(ASN1_TIME *tm, struct buffer *out); +int ssl_sock_get_dn_entry(X509_NAME *a, const struct buffer *entry, int pos, + struct buffer *out); +int ssl_sock_get_dn_formatted(X509_NAME *a, const struct buffer *format, struct buffer *out); +int ssl_sock_get_dn_oneline(X509_NAME *a, struct buffer *out); +X509* ssl_sock_get_peer_certificate(SSL *ssl); +X509* ssl_sock_get_verified_chain_root(SSL *ssl); +unsigned int openssl_version_parser(const char *version); +void exclude_tls_grease(char *input, int len, struct buffer *output); +int x509_v_err_str_to_int(const char *str); +const char *x509_v_err_int_to_str(int code); +long asn1_generalizedtime_to_epoch(ASN1_GENERALIZEDTIME *d); + +#endif /* _HAPROXY_SSL_UTILS_H */ +#endif /* USE_OPENSSL */ + diff --git a/include/haproxy/stats-t.h b/include/haproxy/stats-t.h new file mode 100644 index 0000000..34a4cc2 --- /dev/null +++ b/include/haproxy/stats-t.h @@ -0,0 +1,617 @@ +/* + * include/haproxy/stats-t.h + * This file provides structures and types for stats. 
+ * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_STATS_T_H +#define _HAPROXY_STATS_T_H + +#include <haproxy/api-t.h> + +/* Flags for applet.ctx.stats.flags */ +#define STAT_FMT_HTML 0x00000001 /* dump the stats in HTML format */ +#define STAT_FMT_TYPED 0x00000002 /* use the typed output format */ +#define STAT_FMT_JSON 0x00000004 /* dump the stats in JSON format */ +#define STAT_HIDE_DOWN 0x00000008 /* hide 'down' servers in the stats page */ +#define STAT_NO_REFRESH 0x00000010 /* do not automatically refresh the stats page */ +#define STAT_ADMIN 0x00000020 /* indicate a stats admin level */ +#define STAT_CHUNKED 0x00000040 /* use chunked encoding (HTTP/1.1) */ +#define STAT_JSON_SCHM 0x00000080 /* dump the json schema */ + +#define STAT_HIDEVER 0x00000100 /* conf: do not report the version and reldate */ +#define STAT_SHNODE 0x00000200 /* conf: show node name */ +#define STAT_SHDESC 0x00000400 /* conf: show description */ +#define STAT_SHLGNDS 0x00000800 /* conf: show legends */ +#define STAT_SHOW_FDESC 0x00001000 /* show the field descriptions when possible */ +#define STAT_SHMODULES 0x00002000 /* conf: show modules */ +#define STAT_HIDE_MAINT 0x00004000 /* hide maint/disabled servers */ +#define STAT_CONVDONE 0x00008000 /* conf: rules conversion done */ +#define STAT_USE_FLOAT 0x00010000 /* use floats where possible in the outputs */ + +#define STAT_BOUND 0x00800000 /* bound statistics to selected proxies/types/services */ +#define STAT_STARTED 0x01000000 /* some output has occurred */ + +#define STAT_FMT_MASK 0x00000007 + +#define STATS_TYPE_FE 0 +#define STATS_TYPE_BE 1 +#define STATS_TYPE_SV 2 +#define STATS_TYPE_SO 3 + +#define STATS_DOMAIN (0) /* used for bitshifting, type of statistics: proxy or dns */ +#define STATS_PX_CAP (8) /* used for bitshifting, differentiate obj1 type for proxy statistics */ + +/* HTTP stats : applet.st0 */ +enum { + STAT_HTTP_INIT = 0, /* Initial state */ + STAT_HTTP_HEAD, /* send headers before dump */ + STAT_HTTP_DUMP, /* dumping stats */ + STAT_HTTP_POST, /* waiting post data */ + STAT_HTTP_LAST, /* sending last chunk of response */ + STAT_HTTP_DONE, /* dump is finished */ + STAT_HTTP_END, /* finished */ +}; + +/* status codes available for the stats admin page */ +enum { + STAT_STATUS_INIT = 0, + STAT_STATUS_DENY, /* action denied */ + STAT_STATUS_DONE, /* the action is successful */ + STAT_STATUS_ERRP, /* an error occurred due to invalid values in parameters */ + STAT_STATUS_EXCD, /* an error occurred because the buffer couldn't store all data */ + STAT_STATUS_NONE, /* nothing happened (no action chosen or servers state didn't change) */ + STAT_STATUS_PART, /* the action is partially successful */ + STAT_STATUS_UNKN, /* an unknown error occurred, shouldn't happen */ + STAT_STATUS_IVAL, /* invalid requests (chunked or invalid 
post) */
+	STAT_STATUS_SIZE
+};
+
+/* HTML form to limit output scope */
+#define STAT_SCOPE_TXT_MAXLEN 20      /* max len for scope substring */
+#define STAT_SCOPE_INPUT_NAME "scope" /* pattern form scope name <input> in html form */
+#define STAT_SCOPE_PATTERN    "?" STAT_SCOPE_INPUT_NAME "="
+
+/* Actions available for the stats admin forms */
+enum {
+	ST_ADM_ACTION_NONE = 0,
+
+	/* enable/disable health checks */
+	ST_ADM_ACTION_DHLTH,
+	ST_ADM_ACTION_EHLTH,
+
+	/* force health check status */
+	ST_ADM_ACTION_HRUNN,
+	ST_ADM_ACTION_HNOLB,
+	ST_ADM_ACTION_HDOWN,
+
+	/* enable/disable agent checks */
+	ST_ADM_ACTION_DAGENT,
+	ST_ADM_ACTION_EAGENT,
+
+	/* force agent check status */
+	ST_ADM_ACTION_ARUNN,
+	ST_ADM_ACTION_ADOWN,
+
+	/* set admin state */
+	ST_ADM_ACTION_READY,
+	ST_ADM_ACTION_DRAIN,
+	ST_ADM_ACTION_MAINT,
+	ST_ADM_ACTION_SHUTDOWN,
+	/* these are the ancient actions, still available for compatibility */
+	ST_ADM_ACTION_DISABLE,
+	ST_ADM_ACTION_ENABLE,
+	ST_ADM_ACTION_STOP,
+	ST_ADM_ACTION_START,
+};
+
+
+/* data transmission states for the stats responses */
+enum stat_state {
+	STAT_STATE_INIT = 0,
+	STAT_STATE_HEAD,
+	STAT_STATE_INFO,
+	STAT_STATE_LIST,
+	STAT_STATE_END,
+	STAT_STATE_FIN,
+};
+
+/* kept in 2.6 only for compatibility with legacy code. Will be removed in 2.7,
+ * please do not use these values anymore and define your own!
+ */
+enum obsolete_stat_state {
+	STAT_ST_INIT ENUM_ATTRIBUTE((deprecated)) = 0,
+	STAT_ST_HEAD ENUM_ATTRIBUTE((deprecated)),
+	STAT_ST_INFO ENUM_ATTRIBUTE((deprecated)),
+	STAT_ST_LIST ENUM_ATTRIBUTE((deprecated)),
+	STAT_ST_END ENUM_ATTRIBUTE((deprecated)),
+	STAT_ST_FIN ENUM_ATTRIBUTE((deprecated)),
+};
+
+/* data transmission states for the stats responses inside a proxy */
+enum {
+	STAT_PX_ST_INIT = 0,
+	STAT_PX_ST_TH,
+	STAT_PX_ST_FE,
+	STAT_PX_ST_LI,
+	STAT_PX_ST_SV,
+	STAT_PX_ST_BE,
+	STAT_PX_ST_END,
+	STAT_PX_ST_FIN,
+};
+
+/* This level of detail is needed to let the stats consumer know how to
+ * aggregate them (eg: between processes or cluster nodes). Only a few
+ * combinations are actually in use, though the mechanism tends to make
+ * this easy to extend to future uses.
+ *
+ * Each reported stats element is typed based on 4 dimensions :
+ *   - the field format : it indicates the validity range of the reported value,
+ *     its limits and how to parse it. 6 types are currently supported :
+ *     empty, signed 32-bit integer, unsigned 32-bit integer, signed 64-bit
+ *     integer, unsigned 64-bit integer, string
+ *
+ *   - the field origin : how was the value retrieved and what it depends on.
+ *     5 origins are currently defined : product (eg: haproxy version or
+ *     release date), configuration (eg: a configured limit), key (identifier
+ *     used to group values at a certain level), metric (a measure of something),
+ *     status (something discrete which by definition cannot be averaged nor
+ *     aggregated, such as "listening" versus "full").
+ *
+ *   - the field nature : what does the data represent, implying how to aggregate
+ *     it. At least 12 different natures are expected : counter (an increasing
+ *     positive counter that may wrap when its type is overflown such as a byte
+ *     counter), gauge (a measure at any instant that may vary, such as a
+ *     concurrent connection count), a limit (eg: maximum acceptable concurrent
+ *     connections), a minimum (eg: minimum free memory over a period), a
+ *     maximum (eg: highest queue length over a period), an event rate (eg:
+ *     incoming connections per second), a duration that is often aggregated by
+ *     taking the max (eg: service uptime), an age that generally reports the
+ *     last time an event appeared and which generally is aggregated by taking
+ *     the most recent event hence the smallest one, the time which reports a
+ *     discrete instant and cannot obviously be averaged either, a name which
+ *     will generally be the name of an entity (such as a server name or cookie
+ *     name), an output which is mostly used for various unsafe strings that are
+ *     retrieved (eg: last check output, product name, description, etc), and an
+ *     average which indicates that the value is relative and meant to be averaged
+ *     between all nodes (eg: response time, throttling, etc).
+ *
+ *   - the field scope : if the value is shared with other elements, which ones
+ *     are expected to report the same value. The first scope with the least
+ *     share is the process (most common one) where all data are only relevant
+ *     to the process being consulted. The next one is the service, which is
+ *     valid for all processes launched together (eg: shared SSL cache usage
+ *     among processes). The next one is the system (such as the OS version)
+ *     and which will report the same information for all instances running on
+ *     the same node. The next one is the cluster, which indicates that the
+ *     information is shared with other nodes being part of a same cluster.
+ *     Stick-tables may carry such cluster-wide information. Larger scopes may
+ *     be added in the future such as datacenter, country, continent, planet,
+ *     galaxy, universe, etc.
+ *
+ * All this information is encoded in the field type as a bit field so that
+ * it is easy to pass composite values by simply ORing elements above, and
+ * to ease the definition of a few field types for the most common field
+ * combinations.
+ *
+ * The enums try to be arranged so that the most likely characteristics are
+ * assigned the value zero, making it easier to add new fields.
+ *
+ * Field format has precedence over the other parts of the type. Please avoid
+ * declaring extra formats unless absolutely needed. The first one, FF_EMPTY,
+ * must absolutely have value zero so that it is what is returned after a
+ * memset(0). Furthermore, the producer is responsible for ensuring that when
+ * this format is set, all other bits of the type as well as the values in the
+ * union only contain zeroes. This makes it easier for the consumer to use the
+ * values as the expected type.
+ */ + +enum field_format { + FF_EMPTY = 0x00000000, + FF_S32 = 0x00000001, + FF_U32 = 0x00000002, + FF_S64 = 0x00000003, + FF_U64 = 0x00000004, + FF_STR = 0x00000005, + FF_FLT = 0x00000006, + FF_MASK = 0x000000FF, +}; + +enum field_origin { + FO_METRIC = 0x00000000, + FO_STATUS = 0x00000100, + FO_KEY = 0x00000200, + FO_CONFIG = 0x00000300, + FO_PRODUCT = 0x00000400, + FO_MASK = 0x0000FF00, +}; + +enum field_nature { + FN_GAUGE = 0x00000000, + FN_LIMIT = 0x00010000, + FN_MIN = 0x00020000, + FN_MAX = 0x00030000, + FN_RATE = 0x00040000, + FN_COUNTER = 0x00050000, + FN_DURATION = 0x00060000, + FN_AGE = 0x00070000, + FN_TIME = 0x00080000, + FN_NAME = 0x00090000, + FN_OUTPUT = 0x000A0000, + FN_AVG = 0x000B0000, + FN_MASK = 0x00FF0000, +}; + +enum field_scope { + FS_PROCESS = 0x00000000, + FS_SERVICE = 0x01000000, + FS_SYSTEM = 0x02000000, + FS_CLUSTER = 0x03000000, + FS_MASK = 0xFF000000, +}; + +/* Show info fields for CLI output. For any field added here, please add the + * text representation in the info_fields array. Please only append at the end, + * before the INF_TOTAL_FIELDS entry, and never insert anything in the middle + * nor at the beginning. + */ +enum info_field { + INF_NAME, + INF_VERSION, + INF_RELEASE_DATE, + INF_NBTHREAD, + INF_NBPROC, + INF_PROCESS_NUM, + INF_PID, + INF_UPTIME, + INF_UPTIME_SEC, + INF_MEMMAX_MB, + INF_POOL_ALLOC_MB, + INF_POOL_USED_MB, + INF_POOL_FAILED, + INF_ULIMIT_N, + INF_MAXSOCK, + INF_MAXCONN, + INF_HARD_MAXCONN, + INF_CURR_CONN, + INF_CUM_CONN, + INF_CUM_REQ, + INF_MAX_SSL_CONNS, + INF_CURR_SSL_CONNS, + INF_CUM_SSL_CONNS, + INF_MAXPIPES, + INF_PIPES_USED, + INF_PIPES_FREE, + INF_CONN_RATE, + INF_CONN_RATE_LIMIT, + INF_MAX_CONN_RATE, + INF_SESS_RATE, + INF_SESS_RATE_LIMIT, + INF_MAX_SESS_RATE, + INF_SSL_RATE, + INF_SSL_RATE_LIMIT, + INF_MAX_SSL_RATE, + INF_SSL_FRONTEND_KEY_RATE, + INF_SSL_FRONTEND_MAX_KEY_RATE, + INF_SSL_FRONTEND_SESSION_REUSE_PCT, + INF_SSL_BACKEND_KEY_RATE, + INF_SSL_BACKEND_MAX_KEY_RATE, + INF_SSL_CACHE_LOOKUPS, + INF_SSL_CACHE_MISSES, + INF_COMPRESS_BPS_IN, + INF_COMPRESS_BPS_OUT, + INF_COMPRESS_BPS_RATE_LIM, + INF_ZLIB_MEM_USAGE, + INF_MAX_ZLIB_MEM_USAGE, + INF_TASKS, + INF_RUN_QUEUE, + INF_IDLE_PCT, + INF_NODE, + INF_DESCRIPTION, + INF_STOPPING, + INF_JOBS, + INF_UNSTOPPABLE_JOBS, + INF_LISTENERS, + INF_ACTIVE_PEERS, + INF_CONNECTED_PEERS, + INF_DROPPED_LOGS, + INF_BUSY_POLLING, + INF_FAILED_RESOLUTIONS, + INF_TOTAL_BYTES_OUT, + INF_TOTAL_SPLICED_BYTES_OUT, + INF_BYTES_OUT_RATE, + INF_DEBUG_COMMANDS_ISSUED, + INF_CUM_LOG_MSGS, + INF_BUILD_INFO, + INF_MEMMAX_BYTES, + INF_POOL_ALLOC_BYTES, + INF_POOL_USED_BYTES, + INF_START_TIME_SEC, + INF_TAINTED, + INF_WARNINGS, + INF_MAXCONN_REACHED, + INF_BOOTTIME_MS, + INF_NICED_TASKS, + + /* must always be the last one */ + INF_TOTAL_FIELDS +}; + + +/* Stats fields for CSV output. For any field added here, please add the text + * representation in the stat_fields array. Please only append at the end, + * before the ST_F_TOTAL_FIELDS entry, and never insert anything in the middle + * nor at the beginning.When adding an entry here, one must always add a + * corresponding one in stat_fields[] otherwise Lua's get_stats() will break, + * and "show stats" will show a null. 
+ */ +enum stat_field { + ST_F_PXNAME, + ST_F_SVNAME, + ST_F_QCUR, + ST_F_QMAX, + ST_F_SCUR, + ST_F_SMAX, + ST_F_SLIM, + ST_F_STOT, + ST_F_BIN , + ST_F_BOUT, + ST_F_DREQ, + ST_F_DRESP, + ST_F_EREQ, + ST_F_ECON, + ST_F_ERESP, + ST_F_WRETR, + ST_F_WREDIS, + ST_F_STATUS, + ST_F_WEIGHT, + ST_F_ACT, + ST_F_BCK, + ST_F_CHKFAIL, + ST_F_CHKDOWN, + ST_F_LASTCHG, + ST_F_DOWNTIME, + ST_F_QLIMIT, + ST_F_PID, + ST_F_IID, + ST_F_SID, + ST_F_THROTTLE, + ST_F_LBTOT, + ST_F_TRACKED, + ST_F_TYPE, + ST_F_RATE, + ST_F_RATE_LIM, + ST_F_RATE_MAX, + ST_F_CHECK_STATUS, + ST_F_CHECK_CODE, + ST_F_CHECK_DURATION, + ST_F_HRSP_1XX, + ST_F_HRSP_2XX, + ST_F_HRSP_3XX, + ST_F_HRSP_4XX, + ST_F_HRSP_5XX, + ST_F_HRSP_OTHER, + ST_F_HANAFAIL, + ST_F_REQ_RATE, + ST_F_REQ_RATE_MAX, + ST_F_REQ_TOT, + ST_F_CLI_ABRT, + ST_F_SRV_ABRT, + ST_F_COMP_IN, + ST_F_COMP_OUT, + ST_F_COMP_BYP, + ST_F_COMP_RSP, + ST_F_LASTSESS, + ST_F_LAST_CHK, + ST_F_LAST_AGT, + ST_F_QTIME, + ST_F_CTIME, + ST_F_RTIME, + ST_F_TTIME, + ST_F_AGENT_STATUS, + ST_F_AGENT_CODE, + ST_F_AGENT_DURATION, + ST_F_CHECK_DESC, + ST_F_AGENT_DESC, + ST_F_CHECK_RISE, + ST_F_CHECK_FALL, + ST_F_CHECK_HEALTH, + ST_F_AGENT_RISE, + ST_F_AGENT_FALL, + ST_F_AGENT_HEALTH, + ST_F_ADDR, + ST_F_COOKIE, + ST_F_MODE, + ST_F_ALGO, + ST_F_CONN_RATE, + ST_F_CONN_RATE_MAX, + ST_F_CONN_TOT, + ST_F_INTERCEPTED, + ST_F_DCON, + ST_F_DSES, + ST_F_WREW, + ST_F_CONNECT, + ST_F_REUSE, + ST_F_CACHE_LOOKUPS, + ST_F_CACHE_HITS, + ST_F_SRV_ICUR, + ST_F_SRV_ILIM, + ST_F_QT_MAX, + ST_F_CT_MAX, + ST_F_RT_MAX, + ST_F_TT_MAX, + ST_F_EINT, + ST_F_IDLE_CONN_CUR, + ST_F_SAFE_CONN_CUR, + ST_F_USED_CONN_CUR, + ST_F_NEED_CONN_EST, + ST_F_UWEIGHT, + ST_F_AGG_SRV_STATUS, + ST_F_AGG_SRV_CHECK_STATUS, + ST_F_AGG_CHECK_STATUS, + ST_F_SRID, + ST_F_SESS_OTHER, + ST_F_H1SESS, + ST_F_H2SESS, + ST_F_H3SESS, + ST_F_REQ_OTHER, + ST_F_H1REQ, + ST_F_H2REQ, + ST_F_H3REQ, + ST_F_PROTO, + + /* must always be the last one */ + ST_F_TOTAL_FIELDS +}; + +/* Please consider updating stats_dump_fields_*(), + * stats_dump_.*_info_fields() and stats_*_schema() + * when modifying struct field or related enums. 
+ */
+struct field {
+	uint32_t type;
+	union {
+		int32_t     s32; /* FF_S32 */
+		uint32_t    u32; /* FF_U32 */
+		int64_t     s64; /* FF_S64 */
+		uint64_t    u64; /* FF_U64 */
+		double      flt; /* FF_FLT */
+		const char *str; /* FF_STR */
+	} u;
+};
+
+enum counters_type {
+	COUNTERS_FE = 0,
+	COUNTERS_BE,
+	COUNTERS_SV,
+	COUNTERS_LI,
+	COUNTERS_RSLV,
+
+	COUNTERS_OFF_END
+};
+
+/* Entity used to generate statistics on an HAProxy component */
+struct stats_module {
+	struct list list;
+	const char *name;
+
+	/* functor used to generate the stats module using counters provided through data parameter */
+	void (*fill_stats)(void *data, struct field *);
+
+	struct name_desc *stats; /* name/description of stats provided by the module */
+	void *counters;          /* initial values of allocated counters */
+	size_t counters_off[COUNTERS_OFF_END]; /* list of offsets of allocated counters in various objects */
+	size_t stats_count;      /* count of stats provided */
+	size_t counters_size;    /* sizeof counters */
+
+	uint32_t domain_flags;   /* stats application domain for this module */
+	char clearable;          /* reset on a "clear counters" */
+};
+
+struct extra_counters {
+	char *data;              /* heap containing counters allocated in a linear fashion */
+	size_t size;             /* size of allocated data */
+	enum counters_type type; /* type of object containing the counters */
+};
+
+/* stats_domain is used in a flag as a 1 byte field */
+enum stats_domain {
+	STATS_DOMAIN_PROXY = 0,
+	STATS_DOMAIN_RESOLVERS,
+	STATS_DOMAIN_COUNT,
+
+	STATS_DOMAIN_MASK = 0xff
+};
+
+/* used in a flag as a 1 byte field */
+enum stats_domain_px_cap {
+	STATS_PX_CAP_FE   = 0x01,
+	STATS_PX_CAP_BE   = 0x02,
+	STATS_PX_CAP_SRV  = 0x04,
+	STATS_PX_CAP_LI   = 0x08,
+
+	STATS_PX_CAP_MASK = 0xff
+};
+
+/* the context of a "show stat" command in progress on the CLI or the stats applet */
+struct show_stat_ctx {
+	struct proxy *http_px;  /* parent proxy of the current applet (only relevant for HTTP applet) */
+	void *obj1;             /* context pointer used in stats dump */
+	void *obj2;             /* context pointer used in stats dump */
+	uint32_t domain;        /* the stats domain in use; for now only proxy stats are supported */
+	int scope_str;          /* limit scope to a frontend/backend substring */
+	int scope_len;          /* length of the string above in the buffer */
+	int field;              /* current field iterator when stat line is dumped through returning function */
+	int px_st;              /* STAT_PX_ST* */
+	unsigned int flags;     /* STAT_* from stats-t.h */
+	int iid, type, sid;     /* proxy id, type and service id if bounding of stats is enabled */
+	int st_code;            /* the status code returned by an action */
+	enum stat_state state;  /* phase of output production */
+};
+
+extern THREAD_LOCAL void *trash_counters;
+
+#define EXTRA_COUNTERS(name) \
+	struct extra_counters *name
+
+#define EXTRA_COUNTERS_GET(counters, mod) \
+	(likely(counters) ? \
+	 ((void *)((counters)->data + (mod)->counters_off[(counters)->type])) : \
+	 (trash_counters))
+#define EXTRA_COUNTERS_REGISTER(counters, ctype, alloc_failed_label) \
+	do { \
+		typeof(*counters) _ctr; \
+		_ctr = calloc(1, sizeof(*_ctr)); \
+		if (!_ctr) \
+			goto alloc_failed_label; \
+		_ctr->type = (ctype); \
+		*(counters) = _ctr; \
+	} while (0)
+
+#define EXTRA_COUNTERS_ADD(mod, counters, new_counters, csize) \
+	do { \
+		typeof(counters) _ctr = (counters); \
+		(mod)->counters_off[_ctr->type] = _ctr->size; \
+		_ctr->size += (csize); \
+	} while (0)
+
+#define EXTRA_COUNTERS_ALLOC(counters, alloc_failed_label) \
+	do { \
+		typeof(counters) _ctr = (counters); \
+		_ctr->data = malloc((_ctr)->size); \
+		if (!_ctr->data) \
+			goto alloc_failed_label; \
+	} while (0)
+
+#define EXTRA_COUNTERS_INIT(counters, mod, init_counters, init_counters_size) \
+	do { \
+		typeof(counters) _ctr = (counters); \
+		memcpy(_ctr->data + mod->counters_off[_ctr->type], \
+		       (init_counters), (init_counters_size)); \
+	} while (0)
+
+#define EXTRA_COUNTERS_FREE(counters) \
+	do { \
+		if (counters) { \
+			free((counters)->data); \
+			free(counters); \
+		} \
+	} while (0)
+
+#endif /* _HAPROXY_STATS_T_H */
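Taken together, these macros implement a small linear-allocation lifecycle: register the container, reserve each module's slice, allocate the heap area, seed it with the initial values, then resolve a module's slice with EXTRA_COUNTERS_GET() (which falls back to trash_counters when nothing was registered). A sketch, assuming a hypothetical struct my_counters and an already-filled struct stats_module:

struct my_counters { unsigned long long hits; }; /* hypothetical */

static struct my_counters *example_setup(struct extra_counters **extra,
                                         struct stats_module *mod,
                                         const struct my_counters *init)
{
	EXTRA_COUNTERS_REGISTER(extra, COUNTERS_FE, alloc_failed);
	EXTRA_COUNTERS_ADD(mod, *extra, init, sizeof(*init));
	EXTRA_COUNTERS_ALLOC(*extra, alloc_failed);
	EXTRA_COUNTERS_INIT(*extra, mod, init, sizeof(*init));
	return EXTRA_COUNTERS_GET(*extra, mod);
 alloc_failed:
	return NULL;
}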
diff --git a/include/haproxy/stats.h b/include/haproxy/stats.h
new file mode 100644
index 0000000..f9e6d97
--- /dev/null
+++ b/include/haproxy/stats.h
@@ -0,0 +1,145 @@
+/*
+ * include/haproxy/stats.h
+ * This file contains definitions of some primitives dedicated to
+ * statistics output.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_STATS_H
+#define _HAPROXY_STATS_H
+
+#include <haproxy/api.h>
+#include <haproxy/listener-t.h>
+#include <haproxy/stats-t.h>
+#include <haproxy/tools-t.h>
+
+struct channel;
+struct buffer;
+struct proxy;
+struct appctx;
+struct htx;
+
+/* These two structs contain all field names and descriptions according to
+ * the number of entries in "enum stat_field" and "enum info_field"
+ */
+extern const struct name_desc stat_fields[];
+extern const struct name_desc info_fields[];
+extern const char *stat_status_codes[];
+extern struct applet http_stats_applet;
+extern THREAD_LOCAL struct field info[];
+extern THREAD_LOCAL struct field *stat_l[];
+
+struct htx;
+int stats_putchk(struct appctx *appctx, struct htx *htx);
+
+int stats_dump_one_line(const struct field *stats, size_t stats_count, struct appctx *appctx);
+
+int stats_fill_info(struct field *info, int len, uint flags);
+int stats_fill_fe_stats(struct proxy *px, struct field *stats, int len,
+                        enum stat_field *selected_field);
+int stats_fill_li_stats(struct proxy *px, struct listener *l, int flags,
+                        struct field *stats, int len, enum stat_field *selected_field);
+int stats_fill_sv_stats(struct proxy *px, struct server *sv, int flags,
+                        struct field *stats, int len, enum stat_field *selected_field);
+int stats_fill_be_stats(struct proxy *px, int flags, struct field *stats, int len,
+                        enum stat_field *selected_field);
+
+int stats_emit_raw_data_field(struct buffer *out, const struct field *f);
+int stats_emit_typed_data_field(struct buffer *out, const struct field *f);
+int stats_emit_field_tags(struct buffer *out, const struct field *f,
+                          char delim);
+
+
+static inline enum field_format field_format(const struct field *f, int e)
+{
+	return f[e].type & FF_MASK;
+}
+
+static inline enum field_origin field_origin(const struct field *f, int e)
+{
+	return f[e].type & FO_MASK;
+}
+
+static inline enum field_scope field_scope(const struct field *f, int e)
+{
+	return f[e].type & FS_MASK;
+}
+
+static inline enum field_nature field_nature(const struct field *f, int e)
+{
+	return f[e].type & FN_MASK;
+}
+
+static inline const char *field_str(const struct field *f, int e)
+{
+	return (field_format(f, e) == FF_STR && f[e].u.str) ? f[e].u.str : "";
+}
+
+static inline struct field mkf_s32(uint32_t type, int32_t value)
+{
+	struct field f = { .type = FF_S32 | type, .u.s32 = value };
+	return f;
+}
+
+static inline struct field mkf_u32(uint32_t type, uint32_t value)
+{
+	struct field f = { .type = FF_U32 | type, .u.u32 = value };
+	return f;
+}
+
+static inline struct field mkf_s64(uint32_t type, int64_t value)
+{
+	struct field f = { .type = FF_S64 | type, .u.s64 = value };
+	return f;
+}
+
+static inline struct field mkf_u64(uint32_t type, uint64_t value)
+{
+	struct field f = { .type = FF_U64 | type, .u.u64 = value };
+	return f;
+}
+
+static inline struct field mkf_str(uint32_t type, const char *value)
+{
+	struct field f = { .type = FF_STR | type, .u.str = value };
+	return f;
+}
+
+static inline struct field mkf_flt(uint32_t type, double value)
+{
+	struct field f = { .type = FF_FLT | type, .u.flt = value };
+	return f;
+}
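The mkf_*() helpers above compose the type word (format | origin | nature | scope) together with a value in one step, and the field_*() accessors mask the parts back out. A minimal sketch, using an arbitrary counter value:

static inline unsigned long long example_read_counter(void)
{
	/* FN_COUNTER is a nature; FF_U64 is supplied by mkf_u64() itself */
	struct field f = mkf_u64(FN_COUNTER, 1234ULL);

	/* the format part tells the consumer which union member is valid */
	if (field_format(&f, 0) == FF_U64)
		return f.u.u64;
	return 0;
}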
+#define MK_STATS_PROXY_DOMAIN(px_cap) \
+	((px_cap) << STATS_PX_CAP | STATS_DOMAIN_PROXY)
+
+int stats_allocate_proxy_counters_internal(struct extra_counters **counters,
+                                           int type, int px_cap);
+int stats_allocate_proxy_counters(struct proxy *px);
+
+void stats_register_module(struct stats_module *m);
+
+#endif /* _HAPROXY_STATS_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/stconn-t.h b/include/haproxy/stconn-t.h
new file mode 100644
index 0000000..63bcb79
--- /dev/null
+++ b/include/haproxy/stconn-t.h
@@ -0,0 +1,325 @@
+/*
+ * include/haproxy/stconn-t.h
+ * This file describes the stream connector struct and associated constants.
+ *
+ * Copyright 2021 Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_STCONN_T_H
+#define _HAPROXY_STCONN_T_H
+
+#include <haproxy/obj_type-t.h>
+#include <haproxy/connection-t.h>
+#include <haproxy/pipe-t.h>
+#include <haproxy/show_flags-t.h>
+#include <haproxy/xref-t.h>
+
+enum iobuf_flags {
+	IOBUF_FL_NONE        = 0x00000000, /* For initialization purposes */
+	IOBUF_FL_NO_FF       = 0x00000001, /* Fast-forwarding is not supported */
+	IOBUF_FL_NO_SPLICING = 0x00000002, /* Splicing is not supported or unusable for this stream */
+	IOBUF_FL_FF_BLOCKED  = 0x00000004, /* Fast-forwarding is blocked (buffer allocation/full) */
+
+	IOBUF_FL_INTERIM_FF  = 0x00000008, /* Producer side warns it will immediately retry a fast-forward.
+	                                    * .done_fastfwd() on consumer side must take care of this flag
+	                                    */
+	IOBUF_FL_EOI         = 0x00000010, /* An EOI was encountered on producer side */
+};
+
+struct iobuf {
+	struct pipe *pipe;   /* non-NULL only when data present */
+	struct buffer *buf;
+	size_t offset;
+	size_t data;
+	unsigned int flags;
+};
+
+/* Stream Endpoint Flags.
+ * Please also update the se_show_flags() function below in case of changes.
+ */
+enum se_flags {
+	SE_FL_NONE       = 0x00000000, /* For initialization purposes */
+
+	/* Endpoint types */
+	SE_FL_T_MUX      = 0x00000001, /* The endpoint is a mux (the target may be NULL before the mux init) */
+	SE_FL_T_APPLET   = 0x00000002, /* The endpoint is an applet */
+
+	/* unused: 0x00000004 .. 0x00000008 */
+
+	/* Endpoint states: none == attached to a mux with a stream connector */
+	SE_FL_DETACHED   = 0x00000010, /* The endpoint is detached (no mux/no applet) */
+	SE_FL_ORPHAN     = 0x00000020, /* The endpoint is orphan (no stream connector) */
+
+	/* unused: 0x00000040 .. 0x00000080 */
+
+	SE_FL_SHRD       = 0x00000100, /* read shut, draining extra data */
+	SE_FL_SHRR       = 0x00000200, /* read shut, resetting extra data */
+	SE_FL_SHR        = SE_FL_SHRD | SE_FL_SHRR, /* read shut status */
+
+	SE_FL_SHWN       = 0x00000400, /* write shut, verbose mode */
+	SE_FL_SHWS       = 0x00000800, /* write shut, silent mode */
+	SE_FL_SHW        = SE_FL_SHWN | SE_FL_SHWS, /* write shut status */
+
+	/* following flags are supposed to be set by the endpoint and read by
+	 * the app layer :
+	 */
+
+	/* Permanent flags */
+	SE_FL_NOT_FIRST   = 0x00001000, /* This stream connector is not the first one for the endpoint */
+	SE_FL_WEBSOCKET   = 0x00002000, /* The endpoint uses the websocket proto */
+	SE_FL_EOI         = 0x00004000, /* end-of-input reached */
+	SE_FL_EOS         = 0x00008000, /* End of stream delivered to data layer */
+	SE_FL_ERROR       = 0x00010000, /* a fatal error was reported */
+	/* Transient flags */
+	SE_FL_ERR_PENDING = 0x00020000, /* An error is pending, but there's still data to be read */
+	SE_FL_RCV_MORE    = 0x00040000, /* Endpoint may have more bytes to transfer */
+	SE_FL_WANT_ROOM   = 0x00080000, /* More bytes to transfer, but not enough room */
+	SE_FL_EXP_NO_DATA = 0x00100000, /* No data expected by the endpoint */
+	SE_FL_MAY_FASTFWD_PROD = 0x00200000, /* The endpoint may produce data via zero-copy forwarding */
+	SE_FL_MAY_FASTFWD_CONS = 0x00400000, /* The endpoint may consume data via zero-copy forwarding */
+	SE_FL_ENDP_MASK   = 0x004ff000, /* Mask for flags set by the endpoint */
+
+	/* following flags are supposed to be set by the app layer and read by
+	 * the endpoint :
+	 */
+	/* unused 0x00800000,*/
+	/* unused 0x01000000,*/
+	/* unused 0x02000000,*/
+	SE_FL_WAIT_FOR_HS      = 0x04000000, /* This stream is waiting for a handshake */
+	SE_FL_KILL_CONN        = 0x08000000, /* must kill the connection when the SC closes */
+	SE_FL_WAIT_DATA        = 0x10000000, /* stream endpoint cannot work without more data from the stream's output */
+	SE_FL_WONT_CONSUME     = 0x20000000, /* stream endpoint will not consume more data */
+	SE_FL_HAVE_NO_DATA     = 0x40000000, /* the endpoint has no more data to deliver to the stream */
+	SE_FL_APPLET_NEED_CONN = 0x80000000, /* applet is waiting for the other side to (fail to) connect */
+};
+
+/* This function is used to report flags in debugging tools. Please reflect
+ * below any single-bit flag addition above in the same order via the
+ * __APPEND_FLAG macro. The new end of the buffer is returned.
+ */
+static forceinline char *se_show_flags(char *buf, size_t len, const char *delim, uint flg)
+{
+#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__)
+	/* prologue */
+	_(0);
+	/* flags */
+	_(SE_FL_T_MUX, _(SE_FL_T_APPLET, _(SE_FL_DETACHED, _(SE_FL_ORPHAN,
+	_(SE_FL_SHRD, _(SE_FL_SHRR, _(SE_FL_SHWN, _(SE_FL_SHWS,
+	_(SE_FL_NOT_FIRST, _(SE_FL_WEBSOCKET, _(SE_FL_EOI, _(SE_FL_EOS,
+	_(SE_FL_ERROR, _(SE_FL_ERR_PENDING, _(SE_FL_RCV_MORE,
+	_(SE_FL_WANT_ROOM, _(SE_FL_EXP_NO_DATA, _(SE_FL_MAY_FASTFWD_PROD, _(SE_FL_MAY_FASTFWD_CONS,
+	_(SE_FL_WAIT_FOR_HS, _(SE_FL_KILL_CONN, _(SE_FL_WAIT_DATA,
+	_(SE_FL_WONT_CONSUME, _(SE_FL_HAVE_NO_DATA, _(SE_FL_APPLET_NEED_CONN)))))))))))))))))))))))));
+	/* epilogue */
+	_(~0U);
+	return buf;
+#undef _
+}
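For debugging, this function can render a flag word as text. A minimal sketch, assuming the caller provides the scratch buffer and that pre-clearing it is sufficient for the __APPEND_FLAG machinery:

static inline const char *example_dump_se_flags(const struct sedesc *se,
                                                char *buf, size_t len)
{
	buf[0] = 0;
	se_show_flags(buf, len, "|", se->flags); /* e.g. "SE_FL_T_MUX|SE_FL_EOI" */
	return buf;
}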
+/* stconn flags.
+ * Please also update the sc_show_flags() function below in case of changes.
+ *
+ * When SC_FL_ABRT_WANTED/SC_FL_EOS is set, it is strictly forbidden for the
+ * producer to alter the buffer contents. In this case, the consumer is free to
+ * perform a shutdown when it has consumed the last contents, otherwise the
+ * session processor will do it anyway. SC_FL_ABRT* are set at the upper layer
+ * level (the stream) while SC_FL_EOS is set at the SE layer.
+ *
+ * The SC_FL_SHUT_WANTED flag should be set by the session processor when
+ * SC_FL_ABRT_DONE/SC_FL_EOS and CF_AUTO_CLOSE are both set. And it may also be
+ * set by the producer when it detects SC_FL_EOS while directly forwarding data to the
+ * consumer.
+ *
+ * The SHUT/ABRT flags work like this :
+ *
+ *  ABRT_WANTED ABRT_DONE  meaning
+ *      0           0      normal case, connection still open and data is being read
+ *      1           0      closing : the producer cannot feed data anymore but can close
+ *     0/1          1      closed: the producer has closed its input channel.
+ *
+ *  SHUT_WANTED SHUT_DONE  meaning
+ *      0           0      normal case, connection still open and data is being written
+ *      1           0      closing: the consumer can send last data and may then close
+ *     0/1          1      closed: the consumer has closed its output channel.
+ *
+ *
+ * The ABRT_WANTED flag is mostly used to force the producer to abort when an error is
+ * detected on the consumer side.
+ *
+ */
+enum sc_flags {
+	SC_FL_NONE          = 0x00000000,  /* Just for initialization purposes */
+	SC_FL_ISBACK        = 0x00000001,  /* Set for SC on back-side */
+
+	SC_FL_EOI           = 0x00000002,  /* End of input was reached. no more data will be received from the endpoint */
+	SC_FL_ERROR         = 0x00000004,  /* A fatal error was reported */
+
+	SC_FL_NOLINGER      = 0x00000008,  /* may close without lingering. One-shot. */
+	SC_FL_NOHALF        = 0x00000010,  /* no half close, close both sides at once */
+	SC_FL_DONT_WAKE     = 0x00000020,  /* resync in progress, don't wake up */
+	SC_FL_INDEP_STR     = 0x00000040,  /* independent streams = don't update rex on write */
+
+	SC_FL_WONT_READ     = 0x00000080,  /* SC doesn't want to read data */
+	SC_FL_NEED_BUFF     = 0x00000100,  /* SC waits for an rx buffer allocation to complete */
+	SC_FL_NEED_ROOM     = 0x00000200,  /* SC needs more room in the rx buffer to store incoming data */
+
+	SC_FL_RCV_ONCE      = 0x00000400,  /* Don't loop to receive data. cleared after a successful receive */
+	SC_FL_SND_ASAP      = 0x00000800,  /* Don't wait for sending. cleared when all data were sent */
+	SC_FL_SND_NEVERWAIT = 0x00001000,  /* Never wait for sending (permanent) */
+	SC_FL_SND_EXP_MORE  = 0x00002000,  /* More data expected to be sent very soon. cleared when all data were sent */
+
+	SC_FL_ABRT_WANTED   = 0x00004000,  /* An abort was requested and must be performed ASAP (up side to down side) */
+	SC_FL_SHUT_WANTED   = 0x00008000,  /* A shutdown was requested and must be performed ASAP (up side to down side) */
+	SC_FL_ABRT_DONE     = 0x00010000,  /* An abort was performed for the SC */
+	SC_FL_SHUT_DONE     = 0x00020000,  /* A shutdown was performed for the SC */
+
+	SC_FL_EOS           = 0x00040000,  /* End of stream was reached (from down side to up side) */
+};
+
+/* This function is used to report flags in debugging tools. Please reflect
+ * below any single-bit flag addition above in the same order via the
+ * __APPEND_FLAG macro. The new end of the buffer is returned.
+ */
+static forceinline char *sc_show_flags(char *buf, size_t len, const char *delim, uint flg)
+{
+#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__)
+	/* prologue */
+	_(0);
+	/* flags */
+	_(SC_FL_ISBACK, _(SC_FL_EOI, _(SC_FL_ERROR, _(SC_FL_NOLINGER, _(SC_FL_NOHALF,
+	_(SC_FL_DONT_WAKE, _(SC_FL_INDEP_STR, _(SC_FL_WONT_READ,
+	_(SC_FL_NEED_BUFF, _(SC_FL_NEED_ROOM,
+	_(SC_FL_RCV_ONCE, _(SC_FL_SND_ASAP, _(SC_FL_SND_NEVERWAIT, _(SC_FL_SND_EXP_MORE,
+	_(SC_FL_ABRT_WANTED, _(SC_FL_SHUT_WANTED, _(SC_FL_ABRT_DONE, _(SC_FL_SHUT_DONE,
+	_(SC_FL_EOS)))))))))))))))))));
+	/* epilogue */
+	_(~0U);
+	return buf;
+#undef _
+}
+
+/* A conn stream must have its own errors independently of the buffer's, so that
+ * applications can rely on what the buffer reports while the conn stream is
+ * performing some retries (eg: connection error). Some states are transient and
+ * do not last beyond process_session().
+ */
+enum sc_state {
+	SC_ST_INI = 0,  /* SC not solicited yet */
+	SC_ST_REQ,      /* [transient] connection initiation desired and not started yet */
+	SC_ST_QUE,      /* SC waiting in queue */
+	SC_ST_TAR,      /* SC in turn-around state after failed connect attempt */
+	SC_ST_ASS,      /* server just assigned to this SC */
+	SC_ST_CON,      /* initiated connection request (resource exists) */
+	SC_ST_CER,      /* [transient] previous connection attempt failed (resource released) */
+	SC_ST_RDY,      /* [transient] ready proven after I/O success during SC_ST_CON */
+	SC_ST_EST,      /* connection established (resource exists) */
+	SC_ST_DIS,      /* [transient] disconnected from other side, but cleanup not done yet */
+	SC_ST_CLO,      /* SC closed, might not exist anymore. Buffers shut. */
+} __attribute__((packed));
+
+/* state bits for use with lists of states */
+enum sc_state_bit {
+	SC_SB_NONE = 0,
+	SC_SB_INI  = 1U << SC_ST_INI,
+	SC_SB_REQ  = 1U << SC_ST_REQ,
+	SC_SB_QUE  = 1U << SC_ST_QUE,
+	SC_SB_TAR  = 1U << SC_ST_TAR,
+	SC_SB_ASS  = 1U << SC_ST_ASS,
+	SC_SB_CON  = 1U << SC_ST_CON,
+	SC_SB_CER  = 1U << SC_ST_CER,
+	SC_SB_RDY  = 1U << SC_ST_RDY,
+	SC_SB_EST  = 1U << SC_ST_EST,
+	SC_SB_DIS  = 1U << SC_ST_DIS,
+	SC_SB_CLO  = 1U << SC_ST_CLO,
+	SC_SB_ALL  = SC_SB_INI|SC_SB_REQ|SC_SB_QUE|SC_SB_TAR|SC_SB_ASS|SC_SB_CON|SC_SB_CER|SC_SB_RDY|SC_SB_EST|SC_SB_DIS|SC_SB_CLO,
+};
+
+struct stconn;
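Because each state maps to exactly one bit, the SC_SB_* masks let a connector's state be tested against a whole set of states with a single AND. A sketch of the idiom (HAProxy provides a similar helper; this one is illustrative):

static inline int example_state_in(enum sc_state state, enum sc_state_bit mask)
{
	/* one bit per state, so one AND tests the whole set */
	return !!(mask & (1U << state));
}

/* e.g. "connection proven or established":
 *     example_state_in(state, SC_SB_RDY|SC_SB_EST)
 */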
+/* A Stream Endpoint Descriptor (sedesc) is the link between the stream
+ * connector (ex. stconn) and the Stream Endpoint (mux or appctx).
+ * It always exists for either of them, and binds them together. It also
+ * contains some shared information relative to the endpoint. It is created by
+ * the first one which needs it and is shared by the other one, i.e. on the
+ * client side, it's created by the mux or applet and shared with the connector.
+ * An sedesc without stconn is called an ORPHANED descriptor. An sedesc with
+ * no mux/applet is called a DETACHED descriptor. Upon detach, the connector
+ * transfers the whole responsibility of the endpoint descriptor to the
+ * endpoint itself (mux/applet) and eventually creates a new sedesc (for
+ * instance on connection retries).
+ *
+ * <lra> should be updated when a read activity at the endpoint level is
+ * detected. It can be a successful receive or when an EOS/EOI is reported.
+ * A read activity is also reported when receives are unblocked.
+ *
+ * <fsb> should be updated when the first send of a series is blocked and reset
+ * when a successful send is reported.
+ *
+ * NOTE: <lra> and <fsb> must only be used via the SC api to compute read/write
+ *       expiration date.
+ */
+struct sedesc {
+	void *se;                /* the stream endpoint, i.e. the mux stream or the appctx */
+	struct connection *conn; /* the connection for connection-based streams */
+	struct stconn *sc;       /* the stream connector we're attached to, or NULL */
+	struct iobuf iobuf;      /* contains data forwarded by the other side and that must be sent by the stream endpoint */
+	unsigned int flags;      /* SE_FL_* */
+	unsigned int lra;        /* the last read activity */
+	unsigned int fsb;        /* the first send blocked */
+	/* 4 bytes hole here */
+	struct xref xref;        /* cross reference with the opposite SC */
+};
+
+/* sc_app_ops describes the application layer's operations and notification
+ * callbacks to use when I/O activity is reported and to perform shutr/shutw.
+ * There are very few combinations in practice (strm/chk <-> none/mux/applet).
+ */
+struct sc_app_ops {
+	void (*chk_rcv)(struct stconn *);  /* chk_rcv function, may not be null */
+	void (*chk_snd)(struct stconn *);  /* chk_snd function, may not be null */
+	void (*abort)(struct stconn *);    /* abort function, may not be null */
+	void (*shutdown)(struct stconn *); /* shutdown function, may not be null */
+	int  (*wake)(struct stconn *);     /* data-layer callback to report activity */
+	char name[8];                      /* data layer name, zero-terminated */
+};
+
+/*
+ * This structure describes the elements of a connection relevant to a stream
+ */
+struct stconn {
+	enum obj_type obj_type;  /* differentiates connection from applet context */
+	enum sc_state state;     /* SC_ST* */
+	/* 2 bytes hole here */
+
+	unsigned int flags;      /* SC_FL_* */
+	unsigned int ioto;       /* I/O activity timeout */
+	ssize_t room_needed;     /* free space in the input buffer required to receive more data.
+	                          *  -1 : the SC is waiting for room but not on a specific amount of data
+	                          *  >= 0 : min free space required to progress.
0 means SC must be unblocked ASAP */
+	struct wait_event wait_event; /* We're in a wait list */
+	struct sedesc *sedesc;   /* points to the stream endpoint descriptor */
+	enum obj_type *app;      /* points to the applicative point (stream or check) */
+	const struct sc_app_ops *app_ops; /* general operations used at the app layer */
+	struct sockaddr_storage *src; /* source address (pool), when known, otherwise NULL */
+	struct sockaddr_storage *dst; /* destination address (pool), when known, otherwise NULL */
+};
+
+
+#endif /* _HAPROXY_STCONN_T_H */
diff --git a/include/haproxy/stconn.h b/include/haproxy/stconn.h
new file mode 100644
index 0000000..7869fa3
--- /dev/null
+++ b/include/haproxy/stconn.h
@@ -0,0 +1,557 @@
+/*
+ * include/haproxy/stconn.h
+ * This file contains stream connector function prototypes
+ *
+ * Copyright 2021 Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_STCONN_H
+#define _HAPROXY_STCONN_H
+
+#include <haproxy/api.h>
+#include <haproxy/connection.h>
+#include <haproxy/htx-t.h>
+#include <haproxy/obj_type.h>
+#include <haproxy/stconn-t.h>
+
+struct buffer;
+struct session;
+struct appctx;
+struct stream;
+struct check;
+
+#define IS_HTX_SC(sc)     (sc_conn(sc) && IS_HTX_CONN(__sc_conn(sc)))
+
+struct sedesc *sedesc_new(void);
+void sedesc_free(struct sedesc *sedesc);
+
+struct stconn *sc_new_from_endp(struct sedesc *sedesc, struct session *sess, struct buffer *input);
+struct stconn *sc_new_from_strm(struct stream *strm, unsigned int flags);
+struct stconn *sc_new_from_check(struct check *check, unsigned int flags);
+void sc_free(struct stconn *sc);
+
+int sc_attach_mux(struct stconn *sc, void *target, void *ctx);
+int sc_attach_strm(struct stconn *sc, struct stream *strm);
+
+void sc_destroy(struct stconn *sc);
+int sc_reset_endp(struct stconn *sc);
+
+struct appctx *sc_applet_create(struct stconn *sc, struct applet *app);
+
+void sc_conn_prepare_endp_upgrade(struct stconn *sc);
+void sc_conn_abort_endp_upgrade(struct stconn *sc);
+void sc_conn_commit_endp_upgrade(struct stconn *sc);
+
+/* The se_fl_*() set of functions manipulate the stream endpoint flags from
+ * the stream endpoint itself. The sc_ep_*() set of functions manipulate the
+ * stream endpoint flags from the stream connector (ex. stconn).
+ * _zero() clears all flags, _clr() clears a set of flags (&=~), _set() sets
+ * a set of flags (|=), _test() tests the presence of a set of flags, _get()
+ * retrieves the exact flags, _setall() replaces the flags with the new value.
+ * All functions are purposely marked "forceinline" to avoid slowing down
+ * debugging code too much. None of these functions is atomic-safe.
+ */
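To illustrate the two views described above: the endpoint side manipulates its own sedesc directly, while the stream side goes through the connector. A minimal sketch using the helpers defined just below:

static inline void example_endpoint_reports_eoi(struct sedesc *se)
{
	se_fl_set(se, SE_FL_EOI);         /* endpoint view: set on the sedesc */
}

static inline int example_stream_sees_eoi(const struct stconn *sc)
{
	return sc_ep_test(sc, SE_FL_EOI); /* stream view: read via the connector */
}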
+ */ + +/* stream endpoint version */ +static forceinline void se_fl_zero(struct sedesc *se) +{ + se->flags = 0; +} + +static forceinline void se_fl_setall(struct sedesc *se, uint all) +{ + se->flags = all; +} + +/* sets flags <on> on se->flags and handles ERR_PENDING to ERROR promotion if + * needed (upon EOI/EOS). + */ +static forceinline void se_fl_set(struct sedesc *se, uint on) +{ + if (((on & (SE_FL_EOS|SE_FL_EOI)) && se->flags & SE_FL_ERR_PENDING) || + ((on & SE_FL_ERR_PENDING) && se->flags & (SE_FL_EOI|SE_FL_EOS))) + on |= SE_FL_ERROR; + se->flags |= on; +} + +static forceinline void se_fl_clr(struct sedesc *se, uint off) +{ + se->flags &= ~off; +} + +static forceinline uint se_fl_test(const struct sedesc *se, uint test) +{ + return !!(se->flags & test); +} + +static forceinline uint se_fl_get(const struct sedesc *se) +{ + return se->flags; +} + +/* sets SE_FL_ERROR or SE_FL_ERR_PENDING on the endpoint */ +static inline void se_fl_set_error(struct sedesc *se) +{ + if (se_fl_test(se, (SE_FL_EOS|SE_FL_EOI))) + se_fl_set(se, SE_FL_ERROR); + else + se_fl_set(se, SE_FL_ERR_PENDING); +} + +static inline void se_expect_no_data(struct sedesc *se) +{ + se_fl_set(se, SE_FL_EXP_NO_DATA); +} + +static inline void se_expect_data(struct sedesc *se) +{ + se_fl_clr(se, SE_FL_EXP_NO_DATA); +} + +static inline unsigned int se_have_ff_data(struct sedesc *se) +{ + return (se->iobuf.data | (long)se->iobuf.pipe); +} + +static inline size_t se_ff_data(struct sedesc *se) +{ + return (se->iobuf.data + (se->iobuf.pipe ? se->iobuf.pipe->data : 0)); +} + +/* stream connector version */ +static forceinline void sc_ep_zero(struct stconn *sc) +{ + se_fl_zero(sc->sedesc); +} + +static forceinline void sc_ep_setall(struct stconn *sc, uint all) +{ + se_fl_setall(sc->sedesc, all); +} + +static forceinline void sc_ep_set(struct stconn *sc, uint on) +{ + se_fl_set(sc->sedesc, on); +} + +static forceinline void sc_ep_clr(struct stconn *sc, uint off) +{ + se_fl_clr(sc->sedesc, off); +} + +static forceinline uint sc_ep_test(const struct stconn *sc, uint test) +{ + return se_fl_test(sc->sedesc, test); +} + +static forceinline uint sc_ep_get(const struct stconn *sc) +{ + return se_fl_get(sc->sedesc); +} + +/* Return the last read activity timestamp. May be TICK_ETERNITY */ +static forceinline unsigned int sc_ep_lra(const struct stconn *sc) +{ + return sc->sedesc->lra; +} + +/* Return the first send blocked timestamp. May be TICK_ETERNITY */ +static forceinline unsigned int sc_ep_fsb(const struct stconn *sc) +{ + return sc->sedesc->fsb; +} + +/* Report a read activity. This function sets <lra> to now_ms */ +static forceinline void sc_ep_report_read_activity(struct stconn *sc) +{ + sc->sedesc->lra = now_ms; +} + +/* Report a send blocked. This function sets <fsb> to now_ms if it was not + * already set or if something was sent (to renew <fsb>). + * + * if something was sent (<did_send> != 0), a read activity is also reported for + * non-independent stream. + */ +static forceinline void sc_ep_report_blocked_send(struct stconn *sc, int did_send) +{ + if (did_send || !tick_isset(sc->sedesc->fsb)) { + sc->sedesc->fsb = now_ms; + if (did_send && !(sc->flags & SC_FL_INDEP_STR)) + sc_ep_report_read_activity(sc); + } +} + +/* Report a send activity by setting <fsb> to TICK_ETERNITY. + * For non-independent stream, a read activity is reported. 
+
+/* Report a send activity by setting <fsb> to TICK_ETERNITY.
+ * For non-independent streams, a read activity is reported.
+ */
+static forceinline void sc_ep_report_send_activity(struct stconn *sc)
+{
+	sc->sedesc->fsb = TICK_ETERNITY;
+	if (!(sc->flags & SC_FL_INDEP_STR))
+		sc_ep_report_read_activity(sc);
+}
+
+static forceinline unsigned int sc_ep_have_ff_data(struct stconn *sc)
+{
+	return se_have_ff_data(sc->sedesc);
+}
+
+static forceinline size_t sc_ep_ff_data(struct stconn *sc)
+{
+	return se_ff_data(sc->sedesc);
+}
+
+/* Returns the stream endpoint from a connector, without any control */
+static inline void *__sc_endp(const struct stconn *sc)
+{
+	return sc->sedesc->se;
+}
+
+/* Returns the connection from a sc if the endpoint is a mux stream. Otherwise
+ * NULL is returned. __sc_conn() returns the connection without any control
+ * while sc_conn() checks the endpoint type.
+ */
+static inline struct connection *__sc_conn(const struct stconn *sc)
+{
+	return sc->sedesc->conn;
+}
+static inline struct connection *sc_conn(const struct stconn *sc)
+{
+	if (sc_ep_test(sc, SE_FL_T_MUX))
+		return __sc_conn(sc);
+	return NULL;
+}
+
+/* Returns the mux ops of the connection from an stconn if the endpoint is a
+ * mux stream. Otherwise NULL is returned.
+ */
+static inline const struct mux_ops *sc_mux_ops(const struct stconn *sc)
+{
+	const struct connection *conn = sc_conn(sc);
+
+	return (conn ? conn->mux : NULL);
+}
+
+/* Returns a pointer to the mux stream from a connector if the endpoint is
+ * a mux. Otherwise NULL is returned. __sc_mux_strm() returns the mux without
+ * any control while sc_mux_strm() checks the endpoint type.
+ */
+static inline void *__sc_mux_strm(const struct stconn *sc)
+{
+	return __sc_endp(sc);
+}
+static inline void *sc_mux_strm(const struct stconn *sc)
+{
+	if (sc_ep_test(sc, SE_FL_T_MUX))
+		return __sc_mux_strm(sc);
+	return NULL;
+}
+
+/* Returns the appctx from a sc if the endpoint is an appctx. Otherwise
+ * NULL is returned. __sc_appctx() returns the appctx without any control
+ * while sc_appctx() checks the endpoint type.
+ */
+static inline struct appctx *__sc_appctx(const struct stconn *sc)
+{
+	return __sc_endp(sc);
+}
+static inline struct appctx *sc_appctx(const struct stconn *sc)
+{
+	if (sc_ep_test(sc, SE_FL_T_APPLET))
+		return __sc_appctx(sc);
+	return NULL;
+}
+
+/* Returns the stream from a sc if the application is a stream. Otherwise
+ * NULL is returned. __sc_strm() returns the stream without any control
+ * while sc_strm() checks the application type.
+ */
+static inline struct stream *__sc_strm(const struct stconn *sc)
+{
+	return __objt_stream(sc->app);
+}
+
+static inline struct stream *sc_strm(const struct stconn *sc)
+{
+	if (obj_type(sc->app) == OBJ_TYPE_STREAM)
+		return __sc_strm(sc);
+	return NULL;
+}
+
+/* Returns the healthcheck from a sc if the application is a
+ * healthcheck. Otherwise NULL is returned. __sc_check() returns the healthcheck
+ * without any control while sc_check() checks the application type.
+ */
+static inline struct check *__sc_check(const struct stconn *sc)
+{
+	return __objt_check(sc->app);
+}
+static inline struct check *sc_check(const struct stconn *sc)
+{
+	if (obj_type(sc->app) == OBJ_TYPE_CHECK)
+		return __objt_check(sc->app);
+	return NULL;
+}
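The paired __sc_*()/sc_*() accessors follow one simple convention: the underscored variant trusts the caller, the plain one validates the endpoint or application type first and returns NULL on mismatch. A short usage sketch:

static inline void example_describe_endpoint(struct stconn *sc)
{
	struct connection *conn = sc_conn(sc); /* NULL unless the endpoint is a mux */
	struct appctx *appctx = sc_appctx(sc); /* NULL unless the endpoint is an applet */

	if (conn) {
		/* connection-based: mux operations reachable via sc_mux_ops(sc) */
	}
	else if (appctx) {
		/* applet-based endpoint */
	}
}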
+ */ +static inline const char *sc_get_data_name(const struct stconn *sc) +{ + if (!sc->app_ops) + return "NONE"; + return sc->app_ops->name; +} + +/* shut read */ +static inline void sc_conn_shutr(struct stconn *sc, enum co_shr_mode mode) +{ + const struct mux_ops *mux; + + BUG_ON(!sc_conn(sc)); + + if (sc_ep_test(sc, SE_FL_SHR)) + return; + + /* clean data-layer shutdown */ + mux = sc_mux_ops(sc); + if (mux && mux->shutr) + mux->shutr(sc, mode); + sc_ep_set(sc, (mode == CO_SHR_DRAIN) ? SE_FL_SHRD : SE_FL_SHRR); +} + +/* shut write */ +static inline void sc_conn_shutw(struct stconn *sc, enum co_shw_mode mode) +{ + const struct mux_ops *mux; + + BUG_ON(!sc_conn(sc)); + + if (sc_ep_test(sc, SE_FL_SHW)) + return; + + /* clean data-layer shutdown */ + mux = sc_mux_ops(sc); + if (mux && mux->shutw) + mux->shutw(sc, mode); + sc_ep_set(sc, (mode == CO_SHW_NORMAL) ? SE_FL_SHWN : SE_FL_SHWS); +} + +/* completely close a stream connector (but do not detach it) */ +static inline void sc_conn_shut(struct stconn *sc) +{ + sc_conn_shutw(sc, CO_SHW_SILENT); + sc_conn_shutr(sc, CO_SHR_RESET); +} + +/* completely close a stream connector after draining possibly pending data (but do not detach it) */ +static inline void sc_conn_drain_and_shut(struct stconn *sc) +{ + sc_conn_shutw(sc, CO_SHW_SILENT); + sc_conn_shutr(sc, CO_SHR_DRAIN); +} + +/* Returns non-zero if the stream connector's Rx path is blocked because of + * lack of room in the input buffer. This usually happens after applets failed + * to deliver data into the channel's buffer and reported it via sc_need_room(). + */ +__attribute__((warn_unused_result)) +static inline int sc_waiting_room(const struct stconn *sc) +{ + return !!(sc->flags & SC_FL_NEED_ROOM); +} + +/* The stream endpoint announces it has more data to deliver to the stream's + * input buffer. + */ +static inline void se_have_more_data(struct sedesc *se) +{ + se_fl_clr(se, SE_FL_HAVE_NO_DATA); +} + +/* The stream endpoint announces it doesn't have more data for the stream's + * input buffer. + */ +static inline void se_have_no_more_data(struct sedesc *se) +{ + se_fl_set(se, SE_FL_HAVE_NO_DATA); +} + +/* The application layer informs a stream connector that it's willing to + * receive data from the endpoint. A read activity is reported. + */ +static inline void sc_will_read(struct stconn *sc) +{ + if (sc->flags & SC_FL_WONT_READ) { + sc->flags &= ~SC_FL_WONT_READ; + sc_ep_report_read_activity(sc); + } +} + +/* The application layer informs a stream connector that it will not receive + * data from the endpoint (e.g. need to flush, bw limitations etc). Usually + * it corresponds to the channel's CF_DONT_READ flag. + */ +static inline void sc_wont_read(struct stconn *sc) +{ + sc->flags |= SC_FL_WONT_READ; +} + +/* An frontend (applet) stream endpoint tells the connector it needs the other + * side to connect or fail before continuing to work. This is used for example + * to allow an applet not to deliver data to a request channel before a + * connection is confirmed. + */ +static inline void se_need_remote_conn(struct sedesc *se) +{ + se_fl_set(se, SE_FL_APPLET_NEED_CONN); +} + +/* The application layer tells the stream connector that it just got the input + * buffer it was waiting for. A read activity is reported. + */ +static inline void sc_have_buff(struct stconn *sc) +{ + if (sc->flags & SC_FL_NEED_BUFF) { + sc->flags &= ~SC_FL_NEED_BUFF; + sc_ep_report_read_activity(sc); + } +} + +/* The stream connector failed to get an input buffer and is waiting for it. 
+ * It indicates a willingness to deliver data to the buffer, which will have
+ * to be retried. As such, callers will often automatically clear
+ * SE_FL_HAVE_NO_DATA to be called again as soon as SC_FL_NEED_BUFF is cleared.
+ */
+static inline void sc_need_buff(struct stconn *sc)
+{
+	sc->flags |= SC_FL_NEED_BUFF;
+}
+
+/* Tell a stream connector some room was made in the input buffer and any
+ * failed attempt to inject data into it may be tried again. This is usually
+ * called after a successful transfer of buffer contents to the other side.
+ * A read activity is reported.
+ */
+static inline void sc_have_room(struct stconn *sc)
+{
+	if (sc->flags & SC_FL_NEED_ROOM) {
+		sc->flags &= ~SC_FL_NEED_ROOM;
+		sc->room_needed = 0;
+		sc_ep_report_read_activity(sc);
+	}
+}
+
+/* The stream connector announces it failed to put data into the input buffer
+ * by lack of room. It indicates a willingness to deliver data to the buffer
+ * that will have to be retried. Usually the caller will also clear
+ * SE_FL_HAVE_NO_DATA to be called again as soon as SC_FL_NEED_ROOM is cleared.
+ *
+ * The caller is responsible for specifying the amount of free space required
+ * to make progress, and must take care not to exceed the buffer size.
+ */
+static inline void sc_need_room(struct stconn *sc, ssize_t room_needed)
+{
+	sc->flags |= SC_FL_NEED_ROOM;
+	BUG_ON_HOT(room_needed > (ssize_t)global.tune.bufsize);
+	sc->room_needed = room_needed;
+}
+
+/* The stream endpoint indicates that it's ready to consume data from the
+ * stream's output buffer. A send activity is reported if the SE was blocked.
+ */
+static inline void se_will_consume(struct sedesc *se)
+{
+	if (se_fl_test(se, SE_FL_WONT_CONSUME)) {
+		se_fl_clr(se, SE_FL_WONT_CONSUME);
+		sc_ep_report_send_activity(se->sc);
+	}
+}
+
+/* The stream endpoint indicates that it's not willing to consume data from the
+ * stream's output buffer.
+ */
+static inline void se_wont_consume(struct sedesc *se)
+{
+	se_fl_set(se, SE_FL_WONT_CONSUME);
+}
+
+/* The stream endpoint indicates that it's willing to consume data from the
+ * stream's output buffer, but that there's not enough, so it doesn't want to
+ * be woken up until more is presented.
+ */
+static inline void se_need_more_data(struct sedesc *se)
+{
+	se_will_consume(se);
+	se_fl_set(se, SE_FL_WAIT_DATA);
+}
+
+
+static inline size_t se_nego_ff(struct sedesc *se, struct buffer *input, size_t count, unsigned int may_splice)
+{
+	size_t ret = 0;
+
+	if (se_fl_test(se, SE_FL_T_MUX)) {
+		const struct mux_ops *mux = se->conn->mux;
+
+		se->iobuf.flags &= ~IOBUF_FL_FF_BLOCKED;
+		if (mux->nego_fastfwd && mux->done_fastfwd) {
+			/* Disable zero-copy forwarding if EOS or an error was reported.
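+			 * In that case the transfer must go through the regular
+			 * buffered path so that the stream layer sees and handles
+			 * the EOS/error condition (editor's note).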
			 */
+			if (se_fl_test(se, SE_FL_EOS|SE_FL_ERROR|SE_FL_ERR_PENDING)) {
+				se->iobuf.flags |= IOBUF_FL_NO_FF;
+				goto end;
+			}
+
+			ret = mux->nego_fastfwd(se->sc, input, count, may_splice);
+			if (se->iobuf.flags & IOBUF_FL_FF_BLOCKED) {
+				sc_ep_report_blocked_send(se->sc, 0);
+
+				if (!(se->sc->wait_event.events & SUB_RETRY_SEND)) {
+					/* The SC must be subscribed to send events
+					 * in order to be notified when some space
+					 * is made.
+					 */
+					mux->subscribe(se->sc, SUB_RETRY_SEND, &se->sc->wait_event);
+				}
+			}
+			goto end;
+		}
+	}
+	se->iobuf.flags |= IOBUF_FL_NO_FF;
+
+  end:
+	return ret;
+}
+
+static inline void se_done_ff(struct sedesc *se)
+{
+	if (se_fl_test(se, SE_FL_T_MUX)) {
+		const struct mux_ops *mux = se->conn->mux;
+		size_t sent, to_send = se_ff_data(se);
+
+		BUG_ON(!mux->done_fastfwd);
+		sent = mux->done_fastfwd(se->sc);
+		if (to_send) {
+			if (sent == to_send)
+				sc_ep_report_send_activity(se->sc);
+			else
+				sc_ep_report_blocked_send(se->sc, sent != 0);
+		}
+	}
+}
+
+#endif /* _HAPROXY_STCONN_H */
diff --git a/include/haproxy/stick_table-t.h b/include/haproxy/stick_table-t.h
new file mode 100644
index 0000000..749cb9a
--- /dev/null
+++ b/include/haproxy/stick_table-t.h
@@ -0,0 +1,250 @@
+/*
+ * include/haproxy/stick_table-t.h
+ * Macros, variables and structures for stick tables management.
+ *
+ * Copyright (C) 2009-2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr>
+ * Copyright (C) 2010 Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_STICK_TABLE_T_H
+#define _HAPROXY_STICK_TABLE_T_H
+
+#include <import/ebtree-t.h>
+
+#include <haproxy/api-t.h>
+#include <haproxy/freq_ctr-t.h>
+#include <haproxy/thread-t.h>
+
+#define STKTABLE_MAX_DT_ARRAY_SIZE 100
+
+/* The types of extra data we can store in a stick table */
+enum {
+	STKTABLE_DT_SERVER_ID,    /* the server ID to use with this stream if > 0 */
+	STKTABLE_DT_GPT0,         /* General Purpose Flag 0.
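+	                           * Stored as an unsigned 32-bit integer, like the
+	                           * GPC counters; the STKTABLE_DT_GPT array type
+	                           * further below generalizes it (editor's note).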
	                           */
+	STKTABLE_DT_GPC0,         /* General Purpose Counter 0 (unsigned 32-bit integer) */
+	STKTABLE_DT_GPC0_RATE,    /* General Purpose Counter 0's event rate */
+	STKTABLE_DT_CONN_CNT,     /* cumulated number of connections */
+	STKTABLE_DT_CONN_RATE,    /* incoming connection rate */
+	STKTABLE_DT_CONN_CUR,     /* concurrent number of connections */
+	STKTABLE_DT_SESS_CNT,     /* cumulated number of sessions (accepted connections) */
+	STKTABLE_DT_SESS_RATE,    /* accepted sessions rate */
+	STKTABLE_DT_HTTP_REQ_CNT, /* cumulated number of incoming HTTP requests */
+	STKTABLE_DT_HTTP_REQ_RATE,/* incoming HTTP request rate */
+	STKTABLE_DT_HTTP_ERR_CNT, /* cumulated number of HTTP request errors (4xx) */
+	STKTABLE_DT_HTTP_ERR_RATE,/* HTTP request error rate */
+	STKTABLE_DT_BYTES_IN_CNT, /* cumulated bytes count from client to servers */
+	STKTABLE_DT_BYTES_IN_RATE,/* bytes rate from client to servers */
+	STKTABLE_DT_BYTES_OUT_CNT,/* cumulated bytes count from servers to client */
+	STKTABLE_DT_BYTES_OUT_RATE,/* bytes rate from servers to client */
+	STKTABLE_DT_GPC1,         /* General Purpose Counter 1 (unsigned 32-bit integer) */
+	STKTABLE_DT_GPC1_RATE,    /* General Purpose Counter 1's event rate */
+	STKTABLE_DT_SERVER_KEY,   /* The server key */
+	STKTABLE_DT_HTTP_FAIL_CNT, /* cumulated number of HTTP server failures */
+	STKTABLE_DT_HTTP_FAIL_RATE,/* HTTP server failures rate */
+	STKTABLE_DT_GPT,          /* array of gpt */
+	STKTABLE_DT_GPC,          /* array of gpc */
+	STKTABLE_DT_GPC_RATE,     /* array of gpc_rate */
+
+
+	STKTABLE_STATIC_DATA_TYPES,/* number of types above */
+	/* up to STKTABLE_EXTRA_DATA_TYPES types may be registered here, always
+	 * followed by the number of data types, must always be last.
+	 */
+	STKTABLE_DATA_TYPES = STKTABLE_STATIC_DATA_TYPES + STKTABLE_EXTRA_DATA_TYPES
+};
+
+/* The equivalent standard types of the stored data */
+enum {
+	STD_T_SINT = 0,           /* data is of type signed int */
+	STD_T_UINT,               /* data is of type unsigned int */
+	STD_T_ULL,                /* data is of type unsigned long long */
+	STD_T_FRQP,               /* data is of type freq_ctr */
+	STD_T_DICT,               /* data is of type key of dictionary entry */
+};
+
+/* The types of optional arguments to stored data */
+enum {
+	ARG_T_NONE = 0,           /* data type takes no argument (default) */
+	ARG_T_INT,                /* signed integer */
+	ARG_T_DELAY,              /* a delay which supports time units */
+};
+
+/* The types of keys that servers can be identified by */
+enum {
+	STKTABLE_SRV_NAME = 0,
+	STKTABLE_SRV_ADDR,
+};
+
+/* stick table key type flags */
+#define STK_F_CUSTOM_KEYSIZE      0x00000001   /* this table's key size is configurable */
+
+/* WARNING: if new fields are added, they must be initialized in stream_accept()
+ * and freed in stream_free() !
+ *
+ * The purpose of these two macros:
+ *  - STKCTR_TRACK_BACKEND indicates that a tracking pointer was set from the backend
+ *    and thus that when a keep-alive request goes to another backend, the track
+ *    must cease.
+ *
+ *  - STKCTR_TRACK_CONTENT indicates that the tracking pointer was set in a
+ *    content-aware rule (tcp-request content or http-request) and that the
+ *    tracking has to be performed in the stream and not in the session, and
+ *    will cease for a new keep-alive request over the same connection.
+ *
+ * These values are mixed with the stksess pointer in stkctr->entry.
+ */
+#define STKCTR_TRACK_BACKEND 1
+#define STKCTR_TRACK_CONTENT 2
+
+/* stick_table extra data.
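+ * A typical access (editor's sketch; <ptr> is assumed to be the value
+ * returned by stktable_data_ptr() for a type stored as STD_T_UINT):
+ *
+ *     stktable_data_cast(ptr, std_t_uint)++;
+ *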
+ * The union itself is mainly used for casting or size computation. */
+union stktable_data {
+	/* standard types for easy casting */
+	int std_t_sint;
+	unsigned int std_t_uint;
+	unsigned long long std_t_ull;
+	struct freq_ctr std_t_frqp;
+	struct dict_entry *std_t_dict;
+};
+
+/* known data types */
+struct stktable_data_type {
+	const char *name;         /* name of the data type */
+	int std_type;             /* standard type we can use for this data, STD_T_* */
+	int arg_type;             /* type of optional argument, ARG_T_* */
+	uint is_array:1;          /* this is an array of gpc/gpt */
+	uint is_local:1;          /* this is local only and never learned */
+	uint as_is:1;             /* cannot be processed / used with arithmetic operations */
+};
+
+/* stick table keyword type */
+struct stktable_type {
+	const char *kw;           /* keyword string */
+	int flags;                /* type flags */
+	size_t default_size;      /* default key size */
+};
+
+/* Sticky session.
+ * Any additional data related to the sticky session is installed *before*
+ * stksess (with negative offsets). This allows us to handle variable-sized
+ * keys and variable-sized data without making use of intermediate pointers.
+ */
+struct stksess {
+	unsigned int expire;      /* session expiration date */
+	unsigned int ref_cnt;     /* reference count, can only purge when zero */
+	__decl_thread(HA_RWLOCK_T lock); /* lock related to the table entry */
+	int shard;                /* shard */
+	struct eb32_node exp;     /* ebtree node used to hold the session in expiration tree */
+	struct eb32_node upd;     /* ebtree node used to hold the update sequence tree */
+	struct ebmb_node key;     /* ebtree node used to hold the session in table */
+	/* WARNING! do not put anything after <key>, it's used by the key */
+};
+
+
+/* stick table */
+struct stktable {
+	char *id;                 /* local table id name. */
+	size_t idlen;             /* local table id name length. */
+	char *nid;                /* table id name sent over the network with peers protocol. */
+	struct stktable *next;    /* The stick-table may be linked when belonging to
+	                           * the same configuration section.
+	                           */
+	struct ebpt_node name;    /* Stick-tables are looked up by name here.
*/ + struct pool_head *pool; /* pool used to allocate sticky sessions */ + struct task *exp_task; /* expiration task */ + struct task *sync_task; /* sync task */ + + uint64_t hash_seed; /* hash seed used by shards */ + union { + struct peers *p; /* sync peers */ + char *name; + } peers; + + unsigned long type; /* type of table (determines key format) */ + size_t key_size; /* size of a key, maximum size in case of string */ + unsigned int server_key_type; /* What type of key is used to identify servers */ + unsigned int size; /* maximum number of sticky sessions in table */ + int nopurge; /* if non-zero, don't purge sticky sessions when full */ + int expire; /* time to live for sticky sessions (milliseconds) */ + int data_size; /* the size of the data that is prepended *before* stksess */ + int data_ofs[STKTABLE_DATA_TYPES]; /* negative offsets of present data types, or 0 if absent */ + unsigned int data_nbelem[STKTABLE_DATA_TYPES]; /* to store nb_elem in case of array types */ + union { + int i; + unsigned int u; + void *p; + } data_arg[STKTABLE_DATA_TYPES]; /* optional argument of each data type */ + struct proxy *proxy; /* The proxy this stick-table is attached to, if any.*/ + union { + char *name; /* preparsing hint */ + struct stktable *t; /* postparsing */ + void *ptr; /* generic ptr to check if set or not */ + } write_to; /* updates received on the source table will also update write_to */ + + THREAD_ALIGN(64); + + struct eb_root keys; /* head of sticky session tree */ + struct eb_root exps; /* head of sticky session expiration tree */ + unsigned int refcnt; /* number of local peer over all peers sections + attached to this table */ + unsigned int current; /* number of sticky sessions currently in table */ + __decl_thread(HA_RWLOCK_T lock); /* lock related to the table */ + + THREAD_ALIGN(64); + + struct eb_root updates; /* head of sticky updates sequence tree, uses updt_lock */ + unsigned int update; /* uses updt_lock */ + unsigned int localupdate; /* uses updt_lock */ + unsigned int commitupdate;/* used to identify the latest local updates pending for sync, uses updt_lock */ + + THREAD_ALIGN(64); + /* this lock is heavily used and must be on its own cache line */ + __decl_thread(HA_RWLOCK_T updt_lock); /* lock protecting the updates part */ + + /* rarely used config stuff below (should not interfere with updt_lock) */ + struct proxy *proxies_list; /* The list of proxies which reference this stick-table. */ + struct { + const char *file; /* The file where the stick-table is declared. */ + int line; /* The line in this <file> the stick-table is declared. */ + } conf; +}; + +extern struct stktable_data_type stktable_data_types[STKTABLE_DATA_TYPES]; + +/* stick table key */ +struct stktable_key { + void *key; /* pointer on key buffer */ + size_t key_len; /* data len to read in buff in case of null terminated string */ +}; + +/* stick counter. The <entry> member is a composite address (caddr) made of a + * pointer to an stksess struct, and two flags among STKCTR_TRACK_* above. + */ +struct stkctr { + unsigned long entry; /* entry containing counters currently being tracked by this stream */ + struct stktable *table; /* table the counters above belong to (undefined if counters are null) */ +}; + +/* parameters to configure tracked counters */ +struct track_ctr_prm { + struct sample_expr *expr; /* expression used as the key */ + union { + struct stktable *t; /* a pointer to the table */ + char *n; /* or its name during parsing. 
					 */
+	} table;
+};
+
+#endif /* _HAPROXY_STICK_TABLE_T_H */
diff --git a/include/haproxy/stick_table.h b/include/haproxy/stick_table.h
new file mode 100644
index 0000000..3200437
--- /dev/null
+++ b/include/haproxy/stick_table.h
@@ -0,0 +1,404 @@
+/*
+ * include/haproxy/stick_table.h
+ * Functions for stick tables management.
+ *
+ * Copyright (C) 2009-2010 EXCELIANCE, Emeric Brun <ebrun@exceliance.fr>
+ * Copyright (C) 2010 Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_STICK_TABLE_H
+#define _HAPROXY_STICK_TABLE_H
+
+#include <haproxy/api.h>
+#include <haproxy/dict-t.h>
+#include <haproxy/errors.h>
+#include <haproxy/freq_ctr.h>
+#include <haproxy/sample-t.h>
+#include <haproxy/stick_table-t.h>
+#include <haproxy/ticks.h>
+
+extern struct stktable *stktables_list;
+extern struct pool_head *pool_head_stk_ctr;
+extern struct stktable_type stktable_types[];
+
+#define stktable_data_size(type) (sizeof(((union stktable_data*)0)->type))
+#define stktable_data_cast(ptr, type) ((union stktable_data*)(ptr))->type
+
+void stktable_store_name(struct stktable *t);
+struct stktable *stktable_find_by_name(const char *name);
+struct stksess *stksess_new(struct stktable *t, struct stktable_key *key);
+void stksess_setkey(struct stktable *t, struct stksess *ts, struct stktable_key *key);
+void stksess_free(struct stktable *t, struct stksess *ts);
+int stksess_kill(struct stktable *t, struct stksess *ts, int decrefcount);
+int stktable_get_key_shard(struct stktable *t, const void *key, size_t len);
+
+int stktable_init(struct stktable *t, char **err_msg);
+void stktable_deinit(struct stktable *t);
+int stktable_parse_type(char **args, int *idx, unsigned long *type, size_t *key_size, const char *file, int linenum);
+int parse_stick_table(const char *file, int linenum, char **args,
+                      struct stktable *t, char *id, char *nid, struct peers *peers);
+struct stksess *stktable_get_entry(struct stktable *table, struct stktable_key *key);
+struct stksess *stktable_set_entry(struct stktable *table, struct stksess *nts);
+void stktable_requeue_exp(struct stktable *t, const struct stksess *ts);
+void stktable_touch_with_exp(struct stktable *t, struct stksess *ts, int local, int expire, int decrefcnt);
+void stktable_touch_remote(struct stktable *t, struct stksess *ts, int decrefcnt);
+void stktable_touch_local(struct stktable *t, struct stksess *ts, int decrefcount);
+struct stksess *stktable_lookup(struct stktable *t, struct stksess *ts);
+struct stksess *stktable_lookup_key(struct stktable *t, struct stktable_key *key);
+struct stksess *stktable_update_key(struct stktable *table, struct stktable_key *key);
+struct stktable_key *smp_to_stkey(struct sample *smp, struct stktable *t);
+struct stktable_key *stktable_fetch_key(struct stktable *t, struct proxy *px, struct session *sess,
+                                        struct stream *strm, unsigned int
opt, + struct sample_expr *expr, struct sample *smp); +struct stkctr *smp_fetch_sc_stkctr(struct session *sess, struct stream *strm, const struct arg *args, const char *kw, struct stkctr *stkctr); +struct stkctr *smp_create_src_stkctr(struct session *sess, struct stream *strm, const struct arg *args, const char *kw, struct stkctr *stkctr); +int stktable_compatible_sample(struct sample_expr *expr, unsigned long table_type); +int stktable_register_data_store(int idx, const char *name, int std_type, int arg_type); +int stktable_get_data_type(char *name); +int stktable_trash_oldest(struct stktable *t, int to_batch); +int __stksess_kill(struct stktable *t, struct stksess *ts); + +/************************* Composite address manipulation ********************* + * Composite addresses are simply unsigned long data in which the higher bits + * represent a pointer, and the two lower bits are flags. There are several + * places where we just want to associate one or two flags to a pointer (eg, + * to type it), and these functions permit this. The pointer is necessarily a + * 32-bit aligned pointer, as its two lower bits will be cleared and replaced + * with the flags. + *****************************************************************************/ + +/* Masks the two lower bits of a composite address and converts it to a + * pointer. This is used to mix some bits with some aligned pointers to + * structs and to retrieve the original (32-bit aligned) pointer. + */ +static inline void *caddr_to_ptr(unsigned long caddr) +{ + return (void *)(caddr & ~3UL); +} + +/* Only retrieves the two lower bits of a composite address. This is used to mix + * some bits with some aligned pointers to structs and to retrieve the original + * data (2 bits). + */ +static inline unsigned int caddr_to_data(unsigned long caddr) +{ + return (caddr & 3UL); +} + +/* Combines the aligned pointer whose 2 lower bits will be masked with the bits + * from <data> to form a composite address. This is used to mix some bits with + * some aligned pointers to structs and to retrieve the original (32-bit aligned) + * pointer. + */ +static inline unsigned long caddr_from_ptr(void *ptr, unsigned int data) +{ + return (((unsigned long)ptr) & ~3UL) + (data & 3); +} + +/* sets the 2 bits of <data> in the <caddr> composite address */ +static inline unsigned long caddr_set_flags(unsigned long caddr, unsigned int data) +{ + return caddr | (data & 3); +} + +/* clears the 2 bits of <data> in the <caddr> composite address */ +static inline unsigned long caddr_clr_flags(unsigned long caddr, unsigned int data) +{ + return caddr & ~(unsigned long)(data & 3); +} + + +/* return allocation size for standard data type <type> */ +static inline int stktable_type_size(int type) +{ + switch(type) { + case STD_T_SINT: + case STD_T_UINT: + return sizeof(int); + case STD_T_ULL: + return sizeof(unsigned long long); + case STD_T_FRQP: + return sizeof(struct freq_ctr); + case STD_T_DICT: + return sizeof(struct dict_entry *); + } + return 0; +} + +int stktable_alloc_data_type(struct stktable *t, int type, const char *sa, const char *sa2); + +/* return pointer for data type <type> in sticky session <ts> of table <t>, all + * of which must exist (otherwise use stktable_data_ptr() if unsure). + */ +static inline void *__stktable_data_ptr(struct stktable *t, struct stksess *ts, int type) +{ + return (void *)ts + t->data_ofs[type]; +} + +/* return pointer for data type <type> in sticky session <ts> of table <t>, or + * NULL if either <ts> is NULL or the type is not stored. 
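+ *
+ * Typical use (editor's sketch, mirroring the stkctr_inc_* helpers below;
+ * the entry's lock is assumed to be held by the caller):
+ *
+ *     void *ptr = stktable_data_ptr(t, ts, STKTABLE_DT_CONN_CNT);
+ *     if (ptr)
+ *             stktable_data_cast(ptr, std_t_uint)++;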
+ */
+static inline void *stktable_data_ptr(struct stktable *t, struct stksess *ts, int type)
+{
+	if (type >= STKTABLE_DATA_TYPES)
+		return NULL;
+
+	if (!t->data_ofs[type]) /* type not stored */
+		return NULL;
+
+	if (!ts)
+		return NULL;
+
+	return __stktable_data_ptr(t, ts, type);
+}
+
+/* Returns a pointer to the element of index <idx> of the array data type <type>
+ * in sticky session <ts> of table <t>, or NULL if <ts> is NULL, the type is
+ * not stored, or the requested index is greater than or equal to the number
+ * of elements in the array.
+ * Note: this function is also usable on non-array types, which are treated
+ * as arrays of size 1, so a call with <idx> set to 0 behaves exactly like
+ * 'stktable_data_ptr'.
+ */
+static inline void *stktable_data_ptr_idx(struct stktable *t, struct stksess *ts, int type, unsigned int idx)
+{
+	if (type >= STKTABLE_DATA_TYPES)
+		return NULL;
+
+	if (!t->data_ofs[type]) /* type not stored */
+		return NULL;
+
+	if (!ts)
+		return NULL;
+
+	if (t->data_nbelem[type] <= idx)
+		return NULL;
+
+	return __stktable_data_ptr(t, ts, type) + idx*stktable_type_size(stktable_data_types[type].std_type);
+}
+
+/* kill an entry if it's expired and its ref_cnt is zero */
+static inline int __stksess_kill_if_expired(struct stktable *t, struct stksess *ts)
+{
+	if (t->expire != TICK_ETERNITY && tick_is_expired(ts->expire, now_ms))
+		return __stksess_kill(t, ts);
+
+	return 0;
+}
+
+static inline void stksess_kill_if_expired(struct stktable *t, struct stksess *ts, int decrefcnt)
+{
+	if (decrefcnt && HA_ATOMIC_SUB_FETCH(&ts->ref_cnt, 1) != 0)
+		return;
+
+	if (t->expire != TICK_ETERNITY && tick_is_expired(ts->expire, now_ms)) {
+		HA_RWLOCK_WRLOCK(STK_TABLE_LOCK, &t->lock);
+		__stksess_kill_if_expired(t, ts);
+		HA_RWLOCK_WRUNLOCK(STK_TABLE_LOCK, &t->lock);
+	}
+}
+
+/* sets the stick counter's entry pointer */
+static inline void stkctr_set_entry(struct stkctr *stkctr, struct stksess *entry)
+{
+	stkctr->entry = caddr_from_ptr(entry, 0);
+}
+
+/* returns the entry pointer from a stick counter */
+static inline struct stksess *stkctr_entry(struct stkctr *stkctr)
+{
+	return caddr_to_ptr(stkctr->entry);
+}
+
+/* returns the two flags from a stick counter */
+static inline unsigned int stkctr_flags(struct stkctr *stkctr)
+{
+	return caddr_to_data(stkctr->entry);
+}
+
+/* sets up to two flags at a time on a composite address */
+static inline void stkctr_set_flags(struct stkctr *stkctr, unsigned int flags)
+{
+	stkctr->entry = caddr_set_flags(stkctr->entry, flags);
+}
+
+/* clears up to two flags at a time from a stick counter's composite address */
+static inline void stkctr_clr_flags(struct stkctr *stkctr, unsigned int flags)
+{
+	stkctr->entry = caddr_clr_flags(stkctr->entry, flags);
+}
+
+/* Increase the number of cumulated HTTP requests in the tracked counter
+ * <stkctr>. It returns 0 if the entry pointer does not exist and nothing is
+ * performed. Otherwise it returns 1.
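+ *
+ * Callers usually try the stream's tracked counter first and fall back to
+ * the session's, as stream_inc_http_req_ctr() does in stream.h (editor's
+ * note):
+ *
+ *     if (!stkctr_inc_http_req_ctr(&s->stkctr[i]))
+ *             stkctr_inc_http_req_ctr(&s->sess->stkctr[i]);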
+ */ +static inline int stkctr_inc_http_req_ctr(struct stkctr *stkctr) +{ + struct stksess *ts; + void *ptr1, *ptr2; + + ts = stkctr_entry(stkctr); + if (!ts) + return 0; + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_REQ_CNT); + if (ptr1) + stktable_data_cast(ptr1, std_t_uint)++; + + ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_REQ_RATE); + if (ptr2) + update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_HTTP_REQ_RATE].u, 1); + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + if (ptr1 || ptr2) + stktable_touch_local(stkctr->table, ts, 0); + return 1; +} + +/* Increase the number of cumulated failed HTTP requests in the tracked counter + * <stkctr>. It returns 0 if the entry pointer does not exist and nothing is + * performed. Otherwise it returns 1. + */ +static inline int stkctr_inc_http_err_ctr(struct stkctr *stkctr) +{ + struct stksess *ts; + void *ptr1, *ptr2; + + ts = stkctr_entry(stkctr); + if (!ts) + return 0; + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_ERR_CNT); + if (ptr1) + stktable_data_cast(ptr1, std_t_uint)++; + + ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_ERR_RATE); + if (ptr2) + update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_HTTP_ERR_RATE].u, 1); + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + if (ptr1 || ptr2) + stktable_touch_local(stkctr->table, ts, 0); + return 1; +} + +/* Increase the number of cumulated failed HTTP responses in the tracked counter + * <stkctr>. It returns 0 if the entry pointer does not exist and nothing is + * performed. Otherwise it returns 1. + */ +static inline int stkctr_inc_http_fail_ctr(struct stkctr *stkctr) +{ + struct stksess *ts; + void *ptr1, *ptr2; + + ts = stkctr_entry(stkctr); + if (!ts) + return 0; + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_FAIL_CNT); + if (ptr1) + stktable_data_cast(ptr1, std_t_uint)++; + + ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_HTTP_FAIL_RATE); + if (ptr2) + update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_HTTP_FAIL_RATE].u, 1); + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + if (ptr1 || ptr2) + stktable_touch_local(stkctr->table, ts, 0); + return 1; +} + +/* Increase the number of bytes received in the tracked counter <stkctr>. It + * returns 0 if the entry pointer does not exist and nothing is + * performed. Otherwise it returns 1. 
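+ *
+ * Both the BYTES_IN_CNT and BYTES_IN_RATE data, when stored, are updated
+ * under the entry's write lock, and a local touch is issued afterwards so
+ * that the update gets scheduled for peer synchronization (editor's note).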
+ */ +static inline int stkctr_inc_bytes_in_ctr(struct stkctr *stkctr, unsigned long long bytes) +{ + struct stksess *ts; + void *ptr1, *ptr2; + + ts = stkctr_entry(stkctr); + if (!ts) + return 0; + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_BYTES_IN_CNT); + if (ptr1) + stktable_data_cast(ptr1, std_t_ull) += bytes; + + ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_BYTES_IN_RATE); + if (ptr2) + update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_BYTES_IN_RATE].u, bytes); + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + + /* If data was modified, we need to touch to re-schedule sync */ + if (ptr1 || ptr2) + stktable_touch_local(stkctr->table, ts, 0); + return 1; +} + +/* Increase the number of bytes sent in the tracked counter <stkctr>. It + * returns 0 if the entry pointer does not exist and nothing is + * performed. Otherwise it returns 1. + */ +static inline int stkctr_inc_bytes_out_ctr(struct stkctr *stkctr, unsigned long long bytes) +{ + struct stksess *ts; + void *ptr1, *ptr2; + + ts = stkctr_entry(stkctr); + if (!ts) + return 0; + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + ptr1 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_BYTES_OUT_CNT); + if (ptr1) + stktable_data_cast(ptr1, std_t_ull) += bytes; + + ptr2 = stktable_data_ptr(stkctr->table, ts, STKTABLE_DT_BYTES_OUT_RATE); + if (ptr2) + update_freq_ctr_period(&stktable_data_cast(ptr2, std_t_frqp), + stkctr->table->data_arg[STKTABLE_DT_BYTES_OUT_RATE].u, bytes); + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + + /* If data was modified, we need to touch to re-schedule sync */ + if (ptr1 || ptr2) + stktable_touch_local(stkctr->table, ts, 0); + return 1; +} + +#endif /* _HAPROXY_STICK_TABLE_H */ diff --git a/include/haproxy/stream-t.h b/include/haproxy/stream-t.h new file mode 100644 index 0000000..7e79b96 --- /dev/null +++ b/include/haproxy/stream-t.h @@ -0,0 +1,301 @@ +/* + * include/haproxy/stream-t.h + * This file defines everything related to streams. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_STREAM_T_H +#define _HAPROXY_STREAM_T_H + +#include <sys/time.h> + +#include <haproxy/api-t.h> +#include <haproxy/channel-t.h> +#include <haproxy/stconn-t.h> +#include <haproxy/dynbuf-t.h> +#include <haproxy/filters-t.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/show_flags-t.h> +#include <haproxy/stick_table-t.h> +#include <haproxy/vars-t.h> + + +/* Various Stream Flags, bits values 0x01 to 0x100 (shift 0). + * Please also update the txn_show_flags() function below in case of changes. 
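+ * (the flag-dumping function provided below in this file is strm_show_flags())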
+ */
+#define SF_DIRECT	0x00000001	/* connection made on the server matching the client cookie */
+#define SF_ASSIGNED	0x00000002	/* no need to assign a server to this stream */
+/* unused: 0x00000004 */
+#define SF_BE_ASSIGNED	0x00000008	/* a backend was assigned. Conns are accounted. */
+
+#define SF_FORCE_PRST	0x00000010	/* force persistence here, even if server is down */
+#define SF_MONITOR	0x00000020	/* this stream comes from a monitoring system */
+#define SF_CURR_SESS	0x00000040	/* a connection is currently being counted on the server */
+#define SF_CONN_EXP	0x00000080	/* timeout has expired */
+#define SF_REDISP	0x00000100	/* set if this stream was redispatched from one server to another */
+#define SF_IGNORE	0x00000200	/* the stream led to a mux upgrade, and should be ignored */
+#define SF_REDIRECTABLE	0x00000400	/* set if this stream is redirectable (GET or HEAD) */
+#define SF_HTX		0x00000800	/* set if this stream is an htx stream */
+
+/* stream termination conditions, values 0x1000 to 0xc000 (0-12, shift 12) */
+#define SF_ERR_NONE     0x00000000	/* normal end of request */
+#define SF_ERR_LOCAL    0x00001000	/* the proxy locally processed this request => not an error */
+#define SF_ERR_CLITO    0x00002000	/* client time-out */
+#define SF_ERR_CLICL    0x00003000	/* client closed (read/write error) */
+#define SF_ERR_SRVTO    0x00004000	/* server time-out, connect time-out */
+#define SF_ERR_SRVCL    0x00005000	/* server closed (connect/read/write error) */
+#define SF_ERR_PRXCOND  0x00006000	/* the proxy decided to close (deny...) */
+#define SF_ERR_RESOURCE 0x00007000	/* the proxy encountered a lack of local resources (fd, mem, ...) */
+#define SF_ERR_INTERNAL 0x00008000	/* the proxy encountered an internal error */
+#define SF_ERR_DOWN     0x00009000	/* the proxy killed a stream because the backend became unavailable */
+#define SF_ERR_KILLED   0x0000a000	/* the proxy killed a stream because it was asked to do so */
+#define SF_ERR_UP       0x0000b000	/* the proxy killed a stream because a preferred backend became available */
+#define SF_ERR_CHK_PORT 0x0000c000	/* no port could be found for a health check. TODO: check SF_ERR_SHIFT */
+#define SF_ERR_MASK     0x0000f000	/* mask to get only stream error flags */
+#define SF_ERR_SHIFT    12		/* bit shift */
+
+/* stream state at termination, values 0x10000 to 0x70000 (0-7, shift 16) */
+#define SF_FINST_R	0x00010000	/* stream ended during client request */
+#define SF_FINST_C	0x00020000	/* stream ended during server connect */
+#define SF_FINST_H	0x00030000	/* stream ended during server headers */
+#define SF_FINST_D	0x00040000	/* stream ended during data phase */
+#define SF_FINST_L	0x00050000	/* stream ended while pushing last data to client */
+#define SF_FINST_Q	0x00060000	/* stream ended while waiting in queue for a server slot */
+#define SF_FINST_T	0x00070000	/* stream ended tarpitted */
+#define SF_FINST_MASK	0x00070000	/* mask to get only final stream state flags */
+#define SF_FINST_SHIFT	16		/* bit shift */
+
+#define SF_IGNORE_PRST	0x00080000	/* ignore persistence */
+
+#define SF_SRV_REUSED	0x00100000	/* the server-side connection was reused */
+#define SF_SRV_REUSED_ANTICIPATED  0x00200000	/* the connection was reused but the mux is not ready yet */
+#define SF_WEBSOCKET	0x00400000	/* websocket stream */ // TODO: must be removed
+#define SF_SRC_ADDR	0x00800000	/* get the source ip/port with getsockname */
+
+/* This function is used to report flags in debugging tools.
Please reflect + * below any single-bit flag addition above in the same order via the + * __APPEND_FLAG and __APPEND_ENUM macros. The new end of the buffer is + * returned. + */ +static forceinline char *strm_show_flags(char *buf, size_t len, const char *delim, uint flg) +{ +#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__) +#define _e(m, e, ...) __APPEND_ENUM(buf, len, delim, flg, m, e, #e, __VA_ARGS__) + /* prologue */ + _(0); + /* flags & enums */ + _(SF_IGNORE_PRST, _(SF_SRV_REUSED, _(SF_SRV_REUSED_ANTICIPATED, + _(SF_WEBSOCKET, _(SF_SRC_ADDR))))); + + _e(SF_FINST_MASK, SF_FINST_R, _e(SF_FINST_MASK, SF_FINST_C, + _e(SF_FINST_MASK, SF_FINST_H, _e(SF_FINST_MASK, SF_FINST_D, + _e(SF_FINST_MASK, SF_FINST_L, _e(SF_FINST_MASK, SF_FINST_Q, + _e(SF_FINST_MASK, SF_FINST_T))))))); + + _e(SF_ERR_MASK, SF_ERR_LOCAL, _e(SF_ERR_MASK, SF_ERR_CLITO, + _e(SF_ERR_MASK, SF_ERR_CLICL, _e(SF_ERR_MASK, SF_ERR_SRVTO, + _e(SF_ERR_MASK, SF_ERR_SRVCL, _e(SF_ERR_MASK, SF_ERR_PRXCOND, + _e(SF_ERR_MASK, SF_ERR_RESOURCE, _e(SF_ERR_MASK, SF_ERR_INTERNAL, + _e(SF_ERR_MASK, SF_ERR_DOWN, _e(SF_ERR_MASK, SF_ERR_KILLED, + _e(SF_ERR_MASK, SF_ERR_UP, _e(SF_ERR_MASK, SF_ERR_CHK_PORT)))))))))))); + + _(SF_DIRECT, _(SF_ASSIGNED, _(SF_BE_ASSIGNED, _(SF_FORCE_PRST, + _(SF_MONITOR, _(SF_CURR_SESS, _(SF_CONN_EXP, _(SF_REDISP, + _(SF_IGNORE, _(SF_REDIRECTABLE, _(SF_HTX))))))))))); + + /* epilogue */ + _(~0U); + return buf; +#undef _e +#undef _ +} + + +/* flags for the proxy of the master CLI */ +/* 0x0001.. to 0x8000 are reserved for ACCESS_* flags from cli-t.h */ + +#define PCLI_F_PROMPT 0x10000 +#define PCLI_F_PAYLOAD 0x20000 +#define PCLI_F_RELOAD 0x40000 /* this is the "reload" stream, quits after displaying reload status */ +#define PCLI_F_TIMED 0x80000 /* the prompt shows the process' uptime */ + + +/* error types reported on the streams for more accurate reporting. + * Please also update the strm_et_show_flags() function below in case of changes. + */ +enum { + STRM_ET_NONE = 0x0000, /* no error yet, leave it to zero */ + STRM_ET_QUEUE_TO = 0x0001, /* queue timeout */ + STRM_ET_QUEUE_ERR = 0x0002, /* queue error (eg: full) */ + STRM_ET_QUEUE_ABRT = 0x0004, /* aborted in queue by external cause */ + STRM_ET_CONN_TO = 0x0008, /* connection timeout */ + STRM_ET_CONN_ERR = 0x0010, /* connection error (eg: no server available) */ + STRM_ET_CONN_ABRT = 0x0020, /* connection aborted by external cause (eg: abort) */ + STRM_ET_CONN_RES = 0x0040, /* connection aborted due to lack of resources */ + STRM_ET_CONN_OTHER = 0x0080, /* connection aborted for other reason (eg: 500) */ + STRM_ET_DATA_TO = 0x0100, /* timeout during data phase */ + STRM_ET_DATA_ERR = 0x0200, /* error during data phase */ + STRM_ET_DATA_ABRT = 0x0400, /* data phase aborted by external cause */ +}; + +/* This function is used to report flags in debugging tools. Please reflect + * below any single-bit flag addition above in the same order via the + * __APPEND_FLAG macro. The new end of the buffer is returned. + */ +static forceinline char *strm_et_show_flags(char *buf, size_t len, const char *delim, uint flg) +{ +#define _(f, ...) 
__APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__)
+	/* prologue */
+	_(0);
+	/* flags */
+	_(STRM_ET_QUEUE_TO, _(STRM_ET_QUEUE_ERR, _(STRM_ET_QUEUE_ABRT,
+	_(STRM_ET_CONN_TO, _(STRM_ET_CONN_ERR, _(STRM_ET_CONN_ABRT,
+	_(STRM_ET_CONN_RES, _(STRM_ET_CONN_OTHER, _(STRM_ET_DATA_TO,
+	_(STRM_ET_DATA_ERR, _(STRM_ET_DATA_ABRT)))))))))));
+	/* epilogue */
+	_(~0U);
+	return buf;
+#undef _
+}
+
+struct hlua;
+struct proxy;
+struct pendconn;
+struct session;
+struct server;
+struct task;
+struct sockaddr_storage;
+
+/* some external definitions */
+struct strm_logs {
+	int logwait;                    /* log fields waiting to be collected : LW_* */
+	int level;                      /* log level to force + 1 if > 0, -1 = no log */
+	struct timeval accept_date;     /* date of the stream's accept() in user date */
+	ullong accept_ts;               /* date of the session's accept() in internal date (monotonic) */
+	long t_handshake;               /* handshake duration, -1 if never occurs */
+	long t_idle;                    /* idle duration, -1 if never occurs */
+	ullong request_ts;              /* date when the request arrives in internal date */
+	long t_queue;                   /* delay before the stream gets out of the connect queue, -1 if never occurs */
+	long t_connect;                 /* delay before the connect() to the server succeeds, -1 if never occurs */
+	long t_data;                    /* delay before the first data byte from the server ... */
+	unsigned long t_close;          /* total stream duration */
+	unsigned long srv_queue_pos;    /* number of streams de-queued while waiting for a connection slot on this server */
+	unsigned long prx_queue_pos;    /* number of streams de-queued while waiting for a connection slot on this instance */
+	long long bytes_in;             /* number of bytes transferred from the client to the server */
+	long long bytes_out;            /* number of bytes transferred from the server to the client */
+};
+
+struct stream {
+	enum obj_type obj_type;         /* object type == OBJ_TYPE_STREAM */
+	enum sc_state prev_conn_state;  /* SC_ST*, copy of previous state of the server stream connector */
+
+	int16_t priority_class;         /* priority class of the stream for the pending queue */
+	int32_t priority_offset;        /* priority offset of the stream for the pending queue */
+
+	int flags;                      /* some flags describing the stream */
+	unsigned int uniq_id;           /* unique ID used for the traces */
+	enum obj_type *target;          /* target to use for this stream */
+
+	struct session *sess;           /* the session this stream is attached to */
+
+	struct channel req;             /* request channel */
+	struct channel res;             /* response channel */
+
+	struct proxy *be;               /* the proxy this stream depends on for the server side */
+
+	struct server *srv_conn;        /* stream already has a slot on a server and is not in queue */
+	struct pendconn *pend_pos;      /* if not NULL, points to the pending position in the pending queue */
+
+	struct http_txn *txn;           /* current HTTP transaction being processed. Should become a list. */
+
+	struct task *task;              /* the task associated with this stream */
+	unsigned int pending_events;    /* the pending events not yet processed by the stream.
+	                                 * This is a bit field of TASK_WOKEN_* */
+	int conn_retries;               /* number of connect retries performed */
+	unsigned int conn_exp;          /* wake up time for connect, queue, turn-around, ...
*/ + unsigned int conn_err_type; /* first error detected, one of STRM_ET_* */ + struct list list; /* position in the thread's streams list */ + struct mt_list by_srv; /* position in server stream list */ + struct list back_refs; /* list of users tracking this stream */ + struct buffer_wait buffer_wait; /* position in the list of objects waiting for a buffer */ + + uint64_t lat_time; /* total latency time experienced */ + uint64_t cpu_time; /* total CPU time consumed */ + struct freq_ctr call_rate; /* stream task call rate without making progress */ + + short store_count; + /* 2 unused bytes here */ + + struct { + struct stksess *ts; + struct stktable *table; + } store[8]; /* tracked stickiness values to store */ + + struct stkctr *stkctr; /* content-aware stick counters */ + + struct strm_flt strm_flt; /* current state of filters active on this stream */ + + char **req_cap; /* array of captures from the request (may be NULL) */ + char **res_cap; /* array of captures from the response (may be NULL) */ + struct vars vars_txn; /* list of variables for the txn scope. */ + struct vars vars_reqres; /* list of variables for the request and resp scope. */ + + struct stconn *scf; /* frontend stream connector */ + struct stconn *scb; /* backend stream connector */ + + struct strm_logs logs; /* logs for this stream */ + + void (*do_log)(struct stream *s); /* the function to call in order to log (or NULL) */ + void (*srv_error)(struct stream *s, /* the function to call upon unrecoverable server errors (or NULL) */ + struct stconn *sc); + + int pcli_next_pid; /* next target PID to use for the CLI proxy */ + int pcli_flags; /* flags for CLI proxy */ + char pcli_payload_pat[8]; /* payload pattern for the CLI proxy */ + + struct ist unique_id; /* custom unique ID */ + + /* These two pointers are used to resume the execution of the rule lists. */ + struct list *current_rule_list; /* this is used to store the current executed rule list. */ + void *current_rule; /* this is used to store the current rule to be resumed. */ + int rules_exp; /* expiration date for current rules execution */ + int tunnel_timeout; + const char *last_rule_file; /* last evaluated final rule's file (def: NULL) */ + int last_rule_line; /* last evaluated final rule's line (def: 0) */ + + unsigned int stream_epoch; /* copy of stream_epoch when the stream was created */ + struct hlua *hlua; /* lua runtime context */ + + /* Context */ + struct { + struct resolv_requester *requester; /* owner of the resolution */ + struct act_rule *parent; /* rule which requested this resolution */ + char *hostname_dn; /* hostname being resolve, in domain name format */ + int hostname_dn_len; /* size of hostname_dn */ + /* 4 unused bytes here, recoverable via packing if needed */ + } resolv_ctx; /* context information for DNS resolution */ +}; + +#endif /* _HAPROXY_STREAM_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/stream.h b/include/haproxy/stream.h new file mode 100644 index 0000000..a884007 --- /dev/null +++ b/include/haproxy/stream.h @@ -0,0 +1,404 @@ +/* + * include/haproxy/stream.h + * This file defines everything related to streams. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. 
+ * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_STREAM_H +#define _HAPROXY_STREAM_H + +#include <haproxy/action-t.h> +#include <haproxy/api.h> +#include <haproxy/fd.h> +#include <haproxy/freq_ctr.h> +#include <haproxy/obj_type.h> +#include <haproxy/pool-t.h> +#include <haproxy/queue.h> +#include <haproxy/session.h> +#include <haproxy/stconn.h> +#include <haproxy/stick_table.h> +#include <haproxy/stream-t.h> +#include <haproxy/task-t.h> +#include <haproxy/trace-t.h> + +extern struct trace_source trace_strm; + +/* Details about these events are defined in <src/stream.c> */ +#define STRM_EV_STRM_NEW (1ULL << 0) +#define STRM_EV_STRM_FREE (1ULL << 1) +#define STRM_EV_STRM_ERR (1ULL << 2) +#define STRM_EV_STRM_ANA (1ULL << 3) +#define STRM_EV_STRM_PROC (1ULL << 4) +#define STRM_EV_CS_ST (1ULL << 5) +#define STRM_EV_HTTP_ANA (1ULL << 6) +#define STRM_EV_HTTP_ERR (1ULL << 7) +#define STRM_EV_TCP_ANA (1ULL << 8) +#define STRM_EV_TCP_ERR (1ULL << 9) +#define STRM_EV_FLT_ANA (1ULL << 10) +#define STRM_EV_FLT_ERR (1ULL << 11) + +#define IS_HTX_STRM(strm) ((strm)->flags & SF_HTX) + +extern struct pool_head *pool_head_stream; +extern struct pool_head *pool_head_uniqueid; + +extern struct data_cb sess_conn_cb; + +struct stream *stream_new(struct session *sess, struct stconn *sc, struct buffer *input); +void stream_free(struct stream *s); +int stream_upgrade_from_sc(struct stconn *sc, struct buffer *input); +int stream_set_http_mode(struct stream *s, const struct mux_proto_list *mux_proto); + +/* kill a stream and set the termination flags to <why> (one of SF_ERR_*) */ +void stream_shutdown(struct stream *stream, int why); +void stream_dump_and_crash(enum obj_type *obj, int rate); +void strm_dump_to_buffer(struct buffer *buf, const struct stream *strm, const char *pfx, uint32_t anon_key); + +struct ist stream_generate_unique_id(struct stream *strm, struct list *format); + +void stream_process_counters(struct stream *s); +void sess_change_server(struct stream *strm, struct server *newsrv); +struct task *process_stream(struct task *t, void *context, unsigned int state); +void default_srv_error(struct stream *s, struct stconn *sc); + +/* Update the stream's backend and server time stats */ +void stream_update_time_stats(struct stream *s); +void stream_release_buffers(struct stream *s); +int stream_buf_available(void *arg); + +/* returns the session this stream belongs to */ +static inline struct session *strm_sess(const struct stream *strm) +{ + return strm->sess; +} + +/* returns the frontend this stream was initiated from */ +static inline struct proxy *strm_fe(const struct stream *strm) +{ + return strm->sess->fe; +} + +/* returns the listener this stream was initiated from */ +static inline struct listener *strm_li(const struct stream *strm) +{ + return strm->sess->listener; +} + +/* returns a pointer to the origin of the session which created this stream */ +static inline enum obj_type *strm_orig(const struct stream *strm) +{ + return strm->sess->origin; +} + +/* Remove the refcount from the stream to the tracked counters, and clear the + * pointer 
to ensure this is only performed once. The caller is responsible for + * ensuring that the pointer is valid first. We must be extremely careful not + * to touch the entries we inherited from the session. + */ +static inline void stream_store_counters(struct stream *s) +{ + void *ptr; + int i; + struct stksess *ts; + + if (unlikely(!s->stkctr)) // pool not allocated yet + return; + + for (i = 0; i < global.tune.nb_stk_ctr; i++) { + ts = stkctr_entry(&s->stkctr[i]); + if (!ts) + continue; + + if (stkctr_entry(&s->sess->stkctr[i])) + continue; + + ptr = stktable_data_ptr(s->stkctr[i].table, ts, STKTABLE_DT_CONN_CUR); + if (ptr) { + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + if (stktable_data_cast(ptr, std_t_uint) > 0) + stktable_data_cast(ptr, std_t_uint)--; + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(s->stkctr[i].table, ts, 0); + } + stkctr_set_entry(&s->stkctr[i], NULL); + stksess_kill_if_expired(s->stkctr[i].table, ts, 1); + } +} + +/* Remove the refcount from the stream counters tracked at the content level if + * any, and clear the pointer to ensure this is only performed once. The caller + * is responsible for ensuring that the pointer is valid first. We must be + * extremely careful not to touch the entries we inherited from the session. + */ +static inline void stream_stop_content_counters(struct stream *s) +{ + struct stksess *ts; + void *ptr; + int i; + + if (unlikely(!s->stkctr)) // pool not allocated yet + return; + + for (i = 0; i < global.tune.nb_stk_ctr; i++) { + ts = stkctr_entry(&s->stkctr[i]); + if (!ts) + continue; + + if (stkctr_entry(&s->sess->stkctr[i])) + continue; + + if (!(stkctr_flags(&s->stkctr[i]) & STKCTR_TRACK_CONTENT)) + continue; + + ptr = stktable_data_ptr(s->stkctr[i].table, ts, STKTABLE_DT_CONN_CUR); + if (ptr) { + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + if (stktable_data_cast(ptr, std_t_uint) > 0) + stktable_data_cast(ptr, std_t_uint)--; + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(s->stkctr[i].table, ts, 0); + } + stkctr_set_entry(&s->stkctr[i], NULL); + stksess_kill_if_expired(s->stkctr[i].table, ts, 1); + } +} + +/* Increase total and concurrent connection count for stick entry <ts> of table + * <t>. The caller is responsible for ensuring that <t> and <ts> are valid + * pointers, and for calling this only once per connection. + */ +static inline void stream_start_counters(struct stktable *t, struct stksess *ts) +{ + void *ptr; + + HA_RWLOCK_WRLOCK(STK_SESS_LOCK, &ts->lock); + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_CONN_CUR); + if (ptr) + stktable_data_cast(ptr, std_t_uint)++; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_CONN_CNT); + if (ptr) + stktable_data_cast(ptr, std_t_uint)++; + + ptr = stktable_data_ptr(t, ts, STKTABLE_DT_CONN_RATE); + if (ptr) + update_freq_ctr_period(&stktable_data_cast(ptr, std_t_frqp), + t->data_arg[STKTABLE_DT_CONN_RATE].u, 1); + if (tick_isset(t->expire)) + ts->expire = tick_add(now_ms, MS_TO_TICKS(t->expire)); + + HA_RWLOCK_WRUNLOCK(STK_SESS_LOCK, &ts->lock); + + /* If data was modified, we need to touch to re-schedule sync */ + stktable_touch_local(t, ts, 0); +} + +/* Enable tracking of stream counters as <stkctr> on stksess <ts>. The caller is + * responsible for ensuring that <t> and <ts> are valid pointers. Some controls + * are performed to ensure the state can still change. 
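+ *
+ * Typical call site (editor's sketch; a "track-sc" rule is assumed to have
+ * just looked up or created <ts> in table <t>, <num> being the rule's
+ * counter number):
+ *
+ *     stream_track_stkctr(&s->stkctr[num], t, ts);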
+ */ +static inline void stream_track_stkctr(struct stkctr *ctr, struct stktable *t, struct stksess *ts) +{ + /* Why this test ???? */ + if (stkctr_entry(ctr)) + return; + + ctr->table = t; + stkctr_set_entry(ctr, ts); + stream_start_counters(t, ts); +} + +/* Increase the number of cumulated HTTP requests in the tracked counters */ +static inline void stream_inc_http_req_ctr(struct stream *s) +{ + int i; + + if (unlikely(!s->stkctr)) // pool not allocated yet + return; + + for (i = 0; i < global.tune.nb_stk_ctr; i++) { + if (!stkctr_inc_http_req_ctr(&s->stkctr[i])) + stkctr_inc_http_req_ctr(&s->sess->stkctr[i]); + } +} + +/* Increase the number of cumulated HTTP requests in the backend's tracked + * counters. We don't look up the session since it cannot happen in the backend. + */ +static inline void stream_inc_be_http_req_ctr(struct stream *s) +{ + int i; + + if (unlikely(!s->stkctr)) // pool not allocated yet + return; + + for (i = 0; i < global.tune.nb_stk_ctr; i++) { + if (!stkctr_entry(&s->stkctr[i]) || !(stkctr_flags(&s->stkctr[i]) & STKCTR_TRACK_BACKEND)) + continue; + + stkctr_inc_http_req_ctr(&s->stkctr[i]); + } +} + +/* Increase the number of cumulated failed HTTP requests in the tracked + * counters. Only 4xx requests should be counted here so that we can + * distinguish between errors caused by client behaviour and other ones. + * Note that even 404 are interesting because they're generally caused by + * vulnerability scans. + */ +static inline void stream_inc_http_err_ctr(struct stream *s) +{ + int i; + + if (unlikely(!s->stkctr)) // pool not allocated yet + return; + + for (i = 0; i < global.tune.nb_stk_ctr; i++) { + if (!stkctr_inc_http_err_ctr(&s->stkctr[i])) + stkctr_inc_http_err_ctr(&s->sess->stkctr[i]); + } +} + +/* Increase the number of cumulated failed HTTP responses in the tracked + * counters. Only some 5xx responses should be counted here so that we can + * distinguish between server failures and errors triggered by the client + * (i.e. 501 and 505 may be triggered and must be ignored). + */ +static inline void stream_inc_http_fail_ctr(struct stream *s) +{ + int i; + + if (unlikely(!s->stkctr)) // pool not allocated yet + return; + + for (i = 0; i < global.tune.nb_stk_ctr; i++) { + if (!stkctr_inc_http_fail_ctr(&s->stkctr[i])) + stkctr_inc_http_fail_ctr(&s->sess->stkctr[i]); + } +} + +static inline void stream_add_srv_conn(struct stream *strm, struct server *srv) +{ + /* note: this inserts in reverse order but we do not care, it's only + * used for massive kills (i.e. almost never). MT_LIST_INSERT() is a bit + * faster than MT_LIST_APPEND under contention due to a faster recovery + * from a conflict with an adjacent MT_LIST_DELETE, and using it improves + * the performance by about 3% on 32-cores. + */ + MT_LIST_INSERT(&srv->per_thr[tid].streams, &strm->by_srv); + HA_ATOMIC_STORE(&strm->srv_conn, srv); +} + +static inline void stream_del_srv_conn(struct stream *strm) +{ + struct server *srv = strm->srv_conn; + + if (!srv) + return; + + MT_LIST_DELETE(&strm->by_srv); + HA_ATOMIC_STORE(&strm->srv_conn, NULL); +} + +static inline void stream_init_srv_conn(struct stream *strm) +{ + strm->srv_conn = NULL; + MT_LIST_INIT(&strm->by_srv); +} + +static inline void stream_choose_redispatch(struct stream *s) +{ + /* If the "redispatch" option is set on the backend, we are allowed to + * retry on another server. By default this redispatch occurs on the + * last retry, but if configured we allow redispatches to occur on + * configurable intervals, e.g. on every retry. 
+
+static inline void stream_choose_redispatch(struct stream *s)
+{
+	/* If the "redispatch" option is set on the backend, we are allowed to
+	 * retry on another server. By default this redispatch occurs on the
+	 * last retry, but if configured we allow redispatches to occur on
+	 * configurable intervals, e.g. on every retry. In order to achieve this,
+	 * we must mark the stream unassigned, and eventually clear the DIRECT
+	 * bit to ignore any persistence cookie. We won't count a retry nor a
+	 * redispatch yet, because this will depend on what server is selected.
+	 * If the connection is not persistent, the balancing algorithm is not
+	 * deterministic (e.g. round robin) and there is more than one active
+	 * server, we allow an immediate redispatch without waiting since we
+	 * don't care about this particular server.
+	 */
+	if (objt_server(s->target) &&
+	    (s->be->options & PR_O_REDISP) && !(s->flags & SF_FORCE_PRST) &&
+	    ((__objt_server(s->target)->cur_state < SRV_ST_RUNNING) ||
+	     (((s->be->redispatch_after > 0) &&
+	       (s->conn_retries % s->be->redispatch_after == 0)) ||
+	      ((s->be->redispatch_after < 0) &&
+	       (s->conn_retries % (s->be->conn_retries + 1 + s->be->redispatch_after) == 0))) ||
+	     (!(s->flags & SF_DIRECT) && s->be->srv_act > 1 &&
+	      ((s->be->lbprm.algo & BE_LB_KIND) != BE_LB_KIND_HI)))) {
+		sess_change_server(s, NULL);
+		if (may_dequeue_tasks(objt_server(s->target), s->be))
+			process_srv_queue(objt_server(s->target));
+
+		sockaddr_free(&s->scb->dst);
+		s->flags &= ~(SF_DIRECT | SF_ASSIGNED);
+		s->scb->state = SC_ST_REQ;
+	} else {
+		if (objt_server(s->target))
+			_HA_ATOMIC_INC(&__objt_server(s->target)->counters.retries);
+		_HA_ATOMIC_INC(&s->be->be_counters.retries);
+		s->scb->state = SC_ST_ASS;
+	}
+
+}
+
+/*
+ * This function only has to be called once after a wakeup event in case of
+ * suspected timeout. It controls the stream connection timeout and sets
+ * s->flags accordingly. It does NOT close anything, as this timeout may
+ * be used for any purpose. It returns 1 if the timeout fired, otherwise
+ * zero.
+ */
+static inline int stream_check_conn_timeout(struct stream *s)
+{
+	if (tick_is_expired(s->conn_exp, now_ms)) {
+		s->flags |= SF_CONN_EXP;
+		return 1;
+	}
+	return 0;
+}
+
+int stream_set_timeout(struct stream *s, enum act_timeout_name name, int timeout);
+void stream_retnclose(struct stream *s, const struct buffer *msg);
+void sess_set_term_flags(struct stream *s);
+void stream_abort(struct stream *s);
+
+void service_keywords_register(struct action_kw_list *kw_list);
+struct action_kw *service_find(const char *kw);
+void list_services(FILE *out);
+
+#endif /* _HAPROXY_STREAM_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
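The middle clause of the big condition in stream_choose_redispatch() decides *when* a redispatch may happen relative to the retry counter. A small standalone sketch of just that arithmetic (should_redispatch() is a hypothetical extraction for illustration, not a HAProxy function):

#include <stdio.h>

/* positive redispatch_after: redispatch every Nth retry;
 * negative: count backwards from the last allowed retry.
 */
static int should_redispatch(int conn_retries, int redispatch_after, int be_retries)
{
	if (redispatch_after > 0)
		return conn_retries % redispatch_after == 0;
	if (redispatch_after < 0 && (be_retries + 1 + redispatch_after) > 0)
		return conn_retries % (be_retries + 1 + redispatch_after) == 0;
	return 0;
}

int main(void)
{
	/* with "retries 3" and redispatch_after = -1, only the 3rd retry
	 * triggers a redispatch: 3 % (3 + 1 - 1) == 0
	 */
	printf("%d %d %d\n",
	       should_redispatch(1, -1, 3),   /* 0 */
	       should_redispatch(2, -1, 3),   /* 0 */
	       should_redispatch(3, -1, 3));  /* 1 */
	return 0;
}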
diff --git a/include/haproxy/task-t.h b/include/haproxy/task-t.h
new file mode 100644
index 0000000..ea52de9
--- /dev/null
+++ b/include/haproxy/task-t.h
@@ -0,0 +1,182 @@
+/*
+ * include/haproxy/task-t.h
+ * Macros, variables and structures for task management.
+ *
+ * Copyright (C) 2000-2010 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_TASK_T_H
+#define _HAPROXY_TASK_T_H
+
+#include <sys/time.h>
+
+#include <import/ebtree-t.h>
+
+#include <haproxy/api-t.h>
+#include <haproxy/show_flags-t.h>
+#include <haproxy/thread-t.h>
+
+/* values for task->state (32 bits).
+ * Please also update the task_show_state() function below in case of changes.
+ */
+#define TASK_SLEEPING     0x00000000  /* task sleeping */
+#define TASK_RUNNING      0x00000001  /* the task is currently running */
+/* unused                 0x00000002 */
+#define TASK_QUEUED       0x00000004  /* The task has been (re-)added to the run queue */
+/* unused                 0x00000008 */
+#define TASK_SELF_WAKING  0x00000010  /* task/tasklet found waking itself */
+#define TASK_KILLED       0x00000020  /* task/tasklet killed, may now be freed */
+#define TASK_IN_LIST      0x00000040  /* tasklet is in a tasklet list */
+#define TASK_HEAVY        0x00000080  /* this task/tasklet is extremely heavy */
+
+#define TASK_WOKEN_INIT   0x00000100  /* woken up for initialisation purposes */
+#define TASK_WOKEN_TIMER  0x00000200  /* woken up because of expired timer */
+#define TASK_WOKEN_IO     0x00000400  /* woken up because of completed I/O */
+#define TASK_WOKEN_SIGNAL 0x00000800  /* woken up by a system signal */
+#define TASK_WOKEN_MSG    0x00001000  /* woken up by another task's message */
+#define TASK_WOKEN_RES    0x00002000  /* woken up because of available resource */
+#define TASK_WOKEN_OTHER  0x00004000  /* woken up for an unspecified reason */
+
+/* use this to check a task state or to clean it up before queueing */
+#define TASK_WOKEN_ANY    (TASK_WOKEN_OTHER|TASK_WOKEN_INIT|TASK_WOKEN_TIMER| \
+                           TASK_WOKEN_IO|TASK_WOKEN_SIGNAL|TASK_WOKEN_MSG| \
+                           TASK_WOKEN_RES)
+
+#define TASK_F_TASKLET    0x00008000  /* nature of this task: 0=task 1=tasklet */
+#define TASK_F_USR1       0x00010000  /* preserved user flag 1, application-specific, def:0 */
+/* unused: 0x20000..0x80000000 */
+
+/* These flags are persistent across scheduler calls */
+#define TASK_PERSISTENT   (TASK_SELF_WAKING | TASK_KILLED | \
+                           TASK_HEAVY | TASK_F_TASKLET | TASK_F_USR1)
+
+/* This function is used to report state in debugging tools. Please reflect
+ * below any single-bit flag addition above in the same order via the
+ * __APPEND_FLAG macro. The new end of the buffer is returned.
+ */
+static forceinline char *task_show_state(char *buf, size_t len, const char *delim, uint flg)
+{
+#define _(f, ...) __APPEND_FLAG(buf, len, delim, flg, f, #f, __VA_ARGS__)
+	/* prologue */
+	_(0);
+	/* flags */
+	_(TASK_RUNNING, _(TASK_QUEUED, _(TASK_SELF_WAKING,
+	_(TASK_KILLED, _(TASK_IN_LIST, _(TASK_HEAVY, _(TASK_WOKEN_INIT,
+	_(TASK_WOKEN_TIMER, _(TASK_WOKEN_IO, _(TASK_WOKEN_SIGNAL,
+	_(TASK_WOKEN_MSG, _(TASK_WOKEN_RES, _(TASK_WOKEN_OTHER,
+	_(TASK_F_TASKLET, _(TASK_F_USR1)))))))))))))));
+	/* epilogue */
+	_(~0U);
+	return buf;
+#undef _
+}
+
+/* these wakeup types are used to indicate how a task/tasklet was woken up, for
+ * debugging purposes.
+ */
+enum {
+	WAKEUP_TYPE_UNSET = 0,
+	WAKEUP_TYPE_TASK_WAKEUP,
+	WAKEUP_TYPE_TASK_INSTANT_WAKEUP,
+	WAKEUP_TYPE_TASKLET_WAKEUP,
+	WAKEUP_TYPE_TASKLET_WAKEUP_AFTER,
+	WAKEUP_TYPE_TASK_SCHEDULE,
+	WAKEUP_TYPE_TASK_QUEUE,
+	WAKEUP_TYPE_APPCTX_WAKEUP,
+};
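A hedged sketch of how a reporting helper like task_show_state() above is typically consumed from a debugging handler; the buffer size and delimiter are arbitrary, and the exact output text depends on the __APPEND_FLAG implementation:

/* inside a debugging handler, assuming <t> is a valid struct task * */
char buf[256];

task_show_state(buf, sizeof(buf), "|", _HA_ATOMIC_LOAD(&t->state));
/* buf now holds something like "TASK_RUNNING|TASK_WOKEN_IO" */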
+
+struct notification {
+	struct list purge_me; /* Part of the list of signals to be purged in the
+	                         case of the LUA execution stack crash. */
+	struct list wake_me;  /* Part of the list of signals to be targeted if an
+	                         event occurs. */
+	struct task *task;    /* The task to wake if an event occurs. */
+	__decl_thread(HA_SPINLOCK_T lock);
+};
+
+#ifdef DEBUG_TASK
+/* prev_caller keeps a copy of the previous value of the <caller> field. */
+#define TASK_DEBUG_STORAGE                   \
+	struct {                             \
+		const struct ha_caller *prev_caller; \
+	} debug
+#else
+#define TASK_DEBUG_STORAGE
+#endif
+
+/* This part is common between struct task and struct tasklet so that tasks
+ * can be used as-is as tasklets.
+ *
+ * Note that the process() function must ALWAYS return the task/tasklet's
+ * pointer if the task/tasklet remains valid, and return NULL if it has been
+ * deleted. The scheduler relies on this to know if it should update its state
+ * on return.
+ */
+#define TASK_COMMON \
+	struct { \
+		unsigned int state; /* task state : bitfield of TASK_ */ \
+		int tid;            /* tid of task/tasklet. <0 = local for tasklet, unbound for task */ \
+		struct task *(*process)(struct task *t, void *ctx, unsigned int state); /* the function which processes the task */ \
+		void *context;      /* the task's context */ \
+		const struct ha_caller *caller; /* call place of last wakeup(); 0 on init, -1 on free */ \
+		uint32_t wake_date; /* date of the last task wakeup */ \
+		unsigned int calls; /* number of times process was called */ \
+		TASK_DEBUG_STORAGE; \
+	}
+
+/* The base for all tasks */
+struct task {
+	TASK_COMMON;			/* must be at the beginning! */
+	struct eb32_node rq;		/* ebtree node used to hold the task in the run queue */
+	/* WARNING: the struct task is often aliased as a struct tasklet when
+	 * it is NOT in the run queue. The tasklet has its struct list here
+	 * where rq starts and this works because both are exclusive. Never
+	 * ever reorder these fields without taking this into account!
+	 */
+	struct eb32_node wq;		/* ebtree node used to hold the task in the wait queue */
+	int expire;			/* next expiration date for this task, in ticks */
+	short nice;			/* task prio from -1024 to +1024 */
+	/* 16-bit hole here */
+};
+
+/* lightweight tasks, without priority, mainly used for I/Os */
+struct tasklet {
+	TASK_COMMON;			/* must be at the beginning! */
+	struct list list;
+	/* WARNING: the struct task is often aliased as a struct tasklet when
+	 * it is not in the run queue. The task has its struct rq here where
+	 * list starts and this works because both are exclusive. Never ever
+	 * reorder these fields without taking this into account!
+	 */
+};
+
+/*
+ * The task callback (->process) is responsible for updating ->expire. It must
+ * return a pointer to the task itself, except if the task has been deleted, in
+ * which case it returns NULL so that the scheduler knows it must not check the
+ * expire timer. The scheduler will requeue the task at the proper location.
+ */
+
+
+#endif /* _HAPROXY_TASK_T_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
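To make the callback contract above concrete, here is a hedged sketch of one plausible shape for a periodic task callback; my_ctx, its finished flag and my_do_work() are hypothetical, the 1-second period is arbitrary, and tick_add()/MS_TO_TICKS()/now_ms/task_destroy() are the helpers used elsewhere in these headers:

/* a minimal sketch, not a definitive implementation */
struct task *my_process(struct task *t, void *context, unsigned int state)
{
	struct my_ctx *ctx = context;   /* hypothetical context */

	my_do_work(ctx);

	if (ctx->finished) {
		task_destroy(t);        /* declared in task.h below */
		return NULL;            /* tell the scheduler the task is gone */
	}

	/* still alive: refresh the expiration date and return the task */
	t->expire = tick_add(now_ms, MS_TO_TICKS(1000));
	return t;
}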
diff --git a/include/haproxy/task.h b/include/haproxy/task.h
new file mode 100644
index 0000000..1c9c45f
--- /dev/null
+++ b/include/haproxy/task.h
@@ -0,0 +1,857 @@
+/*
+ * include/haproxy/task.h
+ * Functions for task management.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_TASK_H
+#define _HAPROXY_TASK_H
+
+
+#include <sys/time.h>
+
+#include <import/eb32tree.h>
+
+#include <haproxy/activity.h>
+#include <haproxy/api.h>
+#include <haproxy/clock.h>
+#include <haproxy/fd.h>
+#include <haproxy/global.h>
+#include <haproxy/intops.h>
+#include <haproxy/list.h>
+#include <haproxy/pool.h>
+#include <haproxy/task-t.h>
+#include <haproxy/thread.h>
+#include <haproxy/ticks.h>
+
+
+/* Principle of the wait queue.
+ *
+ * We want to be able to tell whether an expiration date is before or after the
+ * current time <now>. We KNOW that expiration dates are never too far apart,
+ * because they are measured in ticks (milliseconds). We also know that almost
+ * all dates will be in the future, and that a very small part of them will be
+ * in the past, they are the ones which have expired since last time we checked
+ * them. Using ticks, we know if a date is in the future or in the past, but we
+ * cannot use that to store sorted information because that reference changes
+ * all the time.
+ *
+ * We'll use the fact that the time wraps to sort timers. Timers above <now>
+ * are in the future, timers below <now> are in the past. Here, "above" and
+ * "below" are to be considered modulo 2^31.
+ *
+ * Timers are stored sorted in an ebtree. We use the new ability for ebtrees to
+ * lookup values starting from X to only expire tasks between <now> - 2^31 and
+ * <now>. If the end of the tree is reached while walking over it, we simply
+ * loop back to the beginning. That way, we have no problem keeping sorted
+ * wrapping timers in a tree, between (now - 24 days) and (now + 24 days). The
+ * keys in the tree always reflect their real position, none can be infinite.
+ * This reduces the number of checks to be performed.
+ *
+ * Another nice optimisation is to allow a timer to stay at an old place in the
+ * queue as long as it's not further than the real expiration date. That way,
+ * we use the tree as a place holder for a lower bound of the real expiration
+ * date. Since we have a very low chance of hitting a timeout anyway, we can
+ * bounce the nodes to their right place when we scan the tree if we encounter
+ * a misplaced node once in a while. This even allows us not to remove the
+ * infinite timers from the wait queue.
+ *
+ * So, to summarize, we have :
+ *   - node->key always defines current position in the wait queue
+ *   - timer is the real expiration date (possibly infinite)
+ *   - node->key is always before or equal to timer
+ *
+ * The run queue works similarly to the wait queue except that the current date
+ * is replaced by an insertion counter which can also wrap without any problem.
+ */
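The wrap-safe ordering described above boils down to one signed-difference comparison. tick_is_lt() in ticks.h is the real helper used later in this file; the following standalone sketch only illustrates the idea:

#include <stdio.h>

/* d1 is "before" d2 iff it is less than 2^31 ticks behind it, i.e. the
 * unsigned difference interpreted as a signed 32-bit value is negative.
 */
static int before(unsigned int d1, unsigned int d2)
{
	return (int)(d1 - d2) < 0;
}

int main(void)
{
	/* works across the wrap: 0xFFFFFFF0 is "before" 0x00000010 */
	printf("%d\n", before(0xFFFFFFF0u, 0x00000010u)); /* 1 */
	printf("%d\n", before(0x00000010u, 0xFFFFFFF0u)); /* 0 */
	return 0;
}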
+
+/* The farthest we can look back in a timer tree */
+#define TIMER_LOOK_BACK       (1U << 31)
+
+/* tasklets are recognized with nice==-32768 */
+#define TASK_IS_TASKLET(t)      ((t)->state & TASK_F_TASKLET)
+
+/* a few exported variables */
+extern struct pool_head *pool_head_task;
+extern struct pool_head *pool_head_tasklet;
+extern struct pool_head *pool_head_notification;
+
+__decl_thread(extern HA_RWLOCK_T wq_lock THREAD_ALIGNED(64));
+
+void __tasklet_wakeup_on(struct tasklet *tl, int thr);
+struct list *__tasklet_wakeup_after(struct list *head, struct tasklet *tl);
+void task_kill(struct task *t);
+void tasklet_kill(struct tasklet *t);
+void __task_wakeup(struct task *t);
+void __task_queue(struct task *task, struct eb_root *wq);
+
+unsigned int run_tasks_from_lists(unsigned int budgets[]);
+
+/*
+ * This does 3 things :
+ *   - wake up all expired tasks
+ *   - call all runnable tasks
+ *   - return the date of next event in <next> or eternity.
+ */
+void process_runnable_tasks(void);
+
+/*
+ * Extracts all expired timers from the timer queue, and wakes up all
+ * associated tasks.
+ */
+void wake_expired_tasks(void);
+
+/* Checks the next timer for the current thread by looking into its own timer
+ * list and the global one. It may return TICK_ETERNITY if no timer is present.
+ * Note that the next timer might very well be slightly in the past.
+ */
+int next_timer_expiry(void);
+
+/*
+ * Deletes every task before running the master polling loop
+ */
+void mworker_cleantasks(void);
+
+/* returns the number of running tasks+tasklets on the whole process. Note
+ * that this *is* racy since a task may move from the global to a local
+ * queue for example and be counted twice. This is only for statistics
+ * reporting.
+ */
+static inline int total_run_queues()
+{
+	int thr, ret = 0;
+
+	for (thr = 0; thr < global.nbthread; thr++)
+		ret += _HA_ATOMIC_LOAD(&ha_thread_ctx[thr].rq_total);
+	return ret;
+}
+
+/* returns the number of allocated tasks across all threads. Note that this
+ * *is* racy since some threads might be updating their counts while we're
+ * looking, but this is only for statistics reporting.
+ */
+static inline int total_allocated_tasks()
+{
+	int thr, ret;
+
+	for (thr = ret = 0; thr < global.nbthread; thr++)
+		ret += _HA_ATOMIC_LOAD(&ha_thread_ctx[thr].nb_tasks);
+	return ret;
+}
+
+/* returns the number of running niced tasks+tasklets on the whole process.
+ * Note that this *is* racy since a task may move from the global to a local
+ * queue for example and be counted twice. This is only for statistics
+ * reporting.
+ */
+static inline int total_niced_running_tasks()
+{
+	int tgrp, ret = 0;
+
+	for (tgrp = 0; tgrp < global.nbtgroups; tgrp++)
+		ret += _HA_ATOMIC_LOAD(&ha_tgroup_ctx[tgrp].niced_tasks);
+	return ret;
+}
+
+/* return 0 if task is in run queue, otherwise non-zero */
+static inline int task_in_rq(struct task *t)
+{
+	/* Check if leaf_p is NULL, in case the task is not in the run queue,
+	 * and if it's not 0x1, which would mean it's in the tasklet list.
+	 */
+	return t->rq.node.leaf_p != NULL;
+}
+
+/* return 0 if task is in wait queue, otherwise non-zero */
+static inline int task_in_wq(struct task *t)
+{
+	return t->wq.node.leaf_p != NULL;
+}
+
+/* returns true if the current thread has some work to do */
+static inline int thread_has_tasks(void)
+{
+	return ((int)!eb_is_empty(&th_ctx->rqueue) |
+	        (int)!eb_is_empty(&th_ctx->rqueue_shared) |
+	        (int)!!th_ctx->tl_class_mask |
+	        (int)!MT_LIST_ISEMPTY(&th_ctx->shared_tasklet_list));
+}
+
+/* Puts the task <t> in the run queue with reason flags <f>. The task will go
+ * to the local run queue if it is only runnable by the current thread, to the
+ * global run queue otherwise. With DEBUG_TASK, the <file>:<line> from the
+ * call place are stored into the task for tracing purposes.
+ */
+#define task_wakeup(t, f) \
+	_task_wakeup(t, f, MK_CALLER(WAKEUP_TYPE_TASK_WAKEUP, 0, 0))
+
+static inline void _task_wakeup(struct task *t, unsigned int f, const struct ha_caller *caller)
+{
+	unsigned int state;
+
+	state = _HA_ATOMIC_OR_FETCH(&t->state, f);
+	while (!(state & (TASK_RUNNING | TASK_QUEUED))) {
+		if (_HA_ATOMIC_CAS(&t->state, &state, state | TASK_QUEUED)) {
+			if (likely(caller)) {
+				caller = HA_ATOMIC_XCHG(&t->caller, caller);
+				BUG_ON((ulong)caller & 1);
+#ifdef DEBUG_TASK
+				HA_ATOMIC_STORE(&t->debug.prev_caller, caller);
+#endif
+			}
+			__task_wakeup(t);
+			break;
+		}
+	}
+}
+
+/* Atomically drop the TASK_RUNNING bit while ensuring that any wakeup that
+ * happened since the flag was set will result in the task being queued (if
+ * it wasn't already). This is used to safely drop the flag from within the
+ * scheduler. The flag <f> is combined with existing flags before the test so
+ * that it's possible to unconditionally wake up the task and drop the RUNNING
+ * flag if needed.
+ */
+static inline void task_drop_running(struct task *t, unsigned int f)
+{
+	unsigned int state, new_state;
+
+	state = _HA_ATOMIC_LOAD(&t->state);
+
+	while (1) {
+		new_state = state | f;
+		if (new_state & TASK_WOKEN_ANY)
+			new_state |= TASK_QUEUED;
+
+		if (_HA_ATOMIC_CAS(&t->state, &state, new_state & ~TASK_RUNNING))
+			break;
+		__ha_cpu_relax();
+	}
+
+	if ((new_state & ~state) & TASK_QUEUED)
+		__task_wakeup(t);
+}
+
+/*
+ * Unlink the task from the wait queue, and possibly update the last_timer
+ * pointer. A pointer to the task itself is returned. The task *must* already
+ * be in the wait queue before calling this function. If unsure, use the safer
+ * task_unlink_wq() function.
+ */
+static inline struct task *__task_unlink_wq(struct task *t)
+{
+	eb32_delete(&t->wq);
+	return t;
+}
+
+/* remove a task from its wait queue. It may either be the local wait queue if
+ * the task is bound to a single thread or the global queue. If the task uses a
+ * shared wait queue, the global wait queue lock is used.
+ */
+static inline struct task *task_unlink_wq(struct task *t)
+{
+	unsigned long locked;
+
+	if (likely(task_in_wq(t))) {
+		locked = t->tid < 0;
+		BUG_ON(t->tid >= 0 && t->tid != tid && !(global.mode & MODE_STOPPING));
+		if (locked)
+			HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
+		__task_unlink_wq(t);
+		if (locked)
+			HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
+	}
+	return t;
+}
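A minimal usage sketch combining task_wakeup() with the allocation helpers defined further down in this file; my_process and my_ctx are the hypothetical callback and context from the task-t.h sketch earlier:

struct task *t = task_new_here();   /* bound to the current thread */

if (t) {
	t->process = my_process;
	t->context = my_ctx;
	task_wakeup(t, TASK_WOKEN_INIT);  /* queue it for a first run */
}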
+
+/* Place <task> into the wait queue, where it may already be. If the expiration
+ * timer is infinite, do nothing and rely on wake_expired_tasks() to clean up.
+ * If the task uses a shared wait queue, it's queued into the global wait
+ * queue, protected by the global wq_lock; otherwise it necessarily belongs to
+ * the current thread's wait queue and is queued without locking.
+ */
+#define task_queue(t) \
+	_task_queue(t, MK_CALLER(WAKEUP_TYPE_TASK_QUEUE, 0, 0))
+
+static inline void _task_queue(struct task *task, const struct ha_caller *caller)
+{
+	/* If we already have a place in the wait queue no later than the
+	 * timeout we're trying to set, we'll stay there, because it is very
+	 * unlikely that we will reach the timeout anyway. If the timeout
+	 * has been disabled, it's useless to leave the queue as well. We'll
+	 * rely on wake_expired_tasks() to catch the node and move it to the
+	 * proper place should it ever happen. Finally we only add the task
+	 * to the queue if it was not there or if it was further than what
+	 * we want.
+	 */
+	if (!tick_isset(task->expire))
+		return;
+
+#ifdef USE_THREAD
+	if (task->tid < 0) {
+		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
+		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key)) {
+			if (likely(caller)) {
+				caller = HA_ATOMIC_XCHG(&task->caller, caller);
+				BUG_ON((ulong)caller & 1);
+#ifdef DEBUG_TASK
+				HA_ATOMIC_STORE(&task->debug.prev_caller, caller);
+#endif
+			}
+			__task_queue(task, &tg_ctx->timers);
+		}
+		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
+	} else
+#endif
+	{
+		BUG_ON(task->tid != tid);
+		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key)) {
+			if (likely(caller)) {
+				caller = HA_ATOMIC_XCHG(&task->caller, caller);
+				BUG_ON((ulong)caller & 1);
+#ifdef DEBUG_TASK
+				HA_ATOMIC_STORE(&task->debug.prev_caller, caller);
+#endif
+			}
+			__task_queue(task, &th_ctx->timers);
+		}
+	}
+}
+
+/* Change the thread affinity of a task to <thr>, which may either be a valid
+ * thread number from 0 to nbthread-1, or a negative value to allow the task
+ * to run on any thread.
+ *
+ * This may only be done from within the running task itself or during its
+ * initialization. It will unqueue and requeue the task from the wait queue
+ * if it was in it. This is safe against a concurrent task_queue() call because
+ * task_queue() itself will unlink again if needed after taking into account
+ * the new thread_mask.
+ */
+static inline void task_set_thread(struct task *t, int thr)
+{
+#ifndef USE_THREAD
+	/* no shared queue without threads */
+	thr = 0;
+#endif
+	if (unlikely(task_in_wq(t))) {
+		task_unlink_wq(t);
+		t->tid = thr;
+		task_queue(t);
+	}
+	else {
+		t->tid = thr;
+	}
+}
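And the matching sketch for the timer side: arming (or refreshing) a timeout through task_queue(). The 5-second delay is arbitrary; as the comment above explains, an existing earlier queue position is kept:

/* arm a 5s timeout on <t>; requeues only if not queued or queued later */
t->expire = tick_add(now_ms, MS_TO_TICKS(5000));
task_queue(t);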
+
+/* schedules tasklet <tl> to run onto thread <thr> or the current thread if
+ * <thr> is negative. Note that it is illegal to wake up a foreign tasklet if
+ * its tid is negative and it is illegal to self-assign a tasklet that was
+ * at least once scheduled on a specific thread. With DEBUG_TASK, the
+ * <file>:<line> from the call place are stored into the tasklet for tracing
+ * purposes.
+ */
+#define tasklet_wakeup_on(tl, thr) \
+	_tasklet_wakeup_on(tl, thr, MK_CALLER(WAKEUP_TYPE_TASKLET_WAKEUP, 0, 0))
+
+static inline void _tasklet_wakeup_on(struct tasklet *tl, int thr, const struct ha_caller *caller)
+{
+	unsigned int state = tl->state;
+
+	do {
+		/* do nothing if someone else already added it */
+		if (state & TASK_IN_LIST)
+			return;
+	} while (!_HA_ATOMIC_CAS(&tl->state, &state, state | TASK_IN_LIST));
+
+	/* at this point we're the first ones to add this task to the list */
+	if (likely(caller)) {
+		caller = HA_ATOMIC_XCHG(&tl->caller, caller);
+		BUG_ON((ulong)caller & 1);
+#ifdef DEBUG_TASK
+		HA_ATOMIC_STORE(&tl->debug.prev_caller, caller);
+#endif
+	}
+
+	if (_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_TASK_PROFILING)
+		tl->wake_date = now_mono_time();
+	__tasklet_wakeup_on(tl, thr);
+}
+
+/* schedules tasklet <tl> to run onto the thread designated by tl->tid, which
+ * is either its owner thread if >= 0 or the current thread if < 0. When
+ * DEBUG_TASK is set, the <file>:<line> from the call place are stored into the
+ * task for tracing purposes.
+ */
+#define tasklet_wakeup(tl) \
+	_tasklet_wakeup_on(tl, (tl)->tid, MK_CALLER(WAKEUP_TYPE_TASKLET_WAKEUP, 0, 0))
+
+/* instantly wakes up task <t> on its owner thread even if it's not the current
+ * one, bypassing the run queue. The purpose is to be able to avoid contention
+ * in the global run queue for massively remote tasks (e.g. queue) when there's
+ * no value in passing the task again through the priority ordering since it has
+ * already been subject to it once (e.g. before entering process_stream). The
+ * task goes directly into the shared mt_list as a tasklet and will run as
+ * TL_URGENT. Great care is taken to be certain it's not queued nor running
+ * already.
+ */
+#define task_instant_wakeup(t, f) \
+	_task_instant_wakeup(t, f, MK_CALLER(WAKEUP_TYPE_TASK_INSTANT_WAKEUP, 0, 0))
+
+static inline void _task_instant_wakeup(struct task *t, unsigned int f, const struct ha_caller *caller)
+{
+	int thr = t->tid;
+	unsigned int state;
+
+	if (thr < 0)
+		thr = tid;
+
+	/* first, let's update the task's state with the wakeup condition */
+	state = _HA_ATOMIC_OR_FETCH(&t->state, f);
+
+	/* next we need to make sure the task was not/will not be added to the
+	 * run queue because the tasklet list's mt_list uses the same storage
+	 * as the task's run_queue.
+	 */
+	do {
+		/* do nothing if someone else already added it */
+		if (state & (TASK_QUEUED|TASK_RUNNING))
+			return;
+	} while (!_HA_ATOMIC_CAS(&t->state, &state, state | TASK_QUEUED));
+
+	BUG_ON_HOT(task_in_rq(t));
+
+	/* at this point we're the first ones to add this task to the list */
+	if (likely(caller)) {
+		caller = HA_ATOMIC_XCHG(&t->caller, caller);
+		BUG_ON((ulong)caller & 1);
+#ifdef DEBUG_TASK
+		HA_ATOMIC_STORE(&t->debug.prev_caller, caller);
+#endif
+	}
+
+	if (_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_TASK_PROFILING)
+		t->wake_date = now_mono_time();
+	__tasklet_wakeup_on((struct tasklet *)t, thr);
+}
+
+/* schedules tasklet <tl> to run immediately after the current one is done.
+ * <tl> will be queued after entry <head>, or at the head of the task list.
+ * Returns the new head to be used to queue future tasks. This is used to
+ * insert multiple entries at the head of the tasklet list, typically to
+ * transfer processing from a tasklet to another one or a set of other ones.
+ * If <head> is NULL, the tasklet list of <thr> thread will be used. With
+ * DEBUG_TASK, the <file>:<line> from the call place are stored into the
+ * tasklet for tracing purposes.
+ */ +#define tasklet_wakeup_after(head, tl) \ + _tasklet_wakeup_after(head, tl, MK_CALLER(WAKEUP_TYPE_TASKLET_WAKEUP_AFTER, 0, 0)) + +static inline struct list *_tasklet_wakeup_after(struct list *head, struct tasklet *tl, + const struct ha_caller *caller) +{ + unsigned int state = tl->state; + + do { + /* do nothing if someone else already added it */ + if (state & TASK_IN_LIST) + return head; + } while (!_HA_ATOMIC_CAS(&tl->state, &state, state | TASK_IN_LIST)); + + /* at this point we're the first one to add this task to the list */ + if (likely(caller)) { + caller = HA_ATOMIC_XCHG(&tl->caller, caller); + BUG_ON((ulong)caller & 1); +#ifdef DEBUG_TASK + HA_ATOMIC_STORE(&tl->debug.prev_caller, caller); +#endif + } + + if (th_ctx->flags & TH_FL_TASK_PROFILING) + tl->wake_date = now_mono_time(); + return __tasklet_wakeup_after(head, tl); +} + +/* This macro shows the current function name and the last known caller of the + * task (or tasklet) wakeup. + */ +#ifdef DEBUG_TASK +#define DEBUG_TASK_PRINT_CALLER(t) do { \ + const struct ha_caller *__caller = (t)->caller; \ + printf("%s woken up from %s(%s:%d)\n", __FUNCTION__, \ + __caller ? __caller->func : NULL, \ + __caller ? __caller->file : NULL, \ + __caller ? __caller->line : 0); \ +} while (0) +#else +#define DEBUG_TASK_PRINT_CALLER(t) do { } while (0) +#endif + + +/* Try to remove a tasklet from the list. This call is inherently racy and may + * only be performed on the thread that was supposed to dequeue this tasklet. + * This way it is safe to call MT_LIST_DELETE without first removing the + * TASK_IN_LIST bit, which must absolutely be removed afterwards in case + * another thread would want to wake this tasklet up in parallel. + */ +static inline void tasklet_remove_from_tasklet_list(struct tasklet *t) +{ + if (MT_LIST_DELETE(list_to_mt_list(&t->list))) { + _HA_ATOMIC_AND(&t->state, ~TASK_IN_LIST); + _HA_ATOMIC_DEC(&ha_thread_ctx[t->tid >= 0 ? t->tid : tid].rq_total); + } +} + +/* + * Initialize a new task. The bare minimum is performed (queue pointers and + * state). The task is returned. This function should not be used outside of + * task_new(). If the thread ID is < 0, the task may run on any thread. + */ +static inline struct task *task_init(struct task *t, int tid) +{ + t->wq.node.leaf_p = NULL; + t->rq.node.leaf_p = NULL; + t->state = TASK_SLEEPING; +#ifndef USE_THREAD + /* no shared wq without threads */ + tid = 0; +#endif + t->tid = tid; + t->nice = 0; + t->calls = 0; + t->wake_date = 0; + t->expire = TICK_ETERNITY; + t->caller = NULL; + return t; +} + +/* Initialize a new tasklet. It's identified as a tasklet by its flags + * TASK_F_TASKLET. It is expected to run on the calling thread by default, + * it's up to the caller to change ->tid if it wants to own it. + */ +static inline void tasklet_init(struct tasklet *t) +{ + t->calls = 0; + t->state = TASK_F_TASKLET; + t->process = NULL; + t->tid = -1; + t->wake_date = 0; + t->caller = NULL; + LIST_INIT(&t->list); +} + +/* Allocate and initialize a new tasklet, local to the thread by default. The + * caller may assign its tid if it wants to own the tasklet. + */ +static inline struct tasklet *tasklet_new(void) +{ + struct tasklet *t = pool_alloc(pool_head_tasklet); + + if (t) { + tasklet_init(t); + } + return t; +} + +/* + * Allocate and initialize a new task, to run on global thread <thr>, or any + * thread if negative. The task count is incremented. The new task is returned, + * or NULL in case of lack of memory. 
It's up to the caller to pass a valid
+ * thread number (in tid space, 0 to nbthread-1, or <0 for any). Tasks created
+ * this way must be freed using task_destroy().
+ */
+static inline struct task *task_new_on(int thr)
+{
+	struct task *t = pool_alloc(pool_head_task);
+	if (t) {
+		th_ctx->nb_tasks++;
+		task_init(t, thr);
+	}
+	return t;
+}
+
+/* Allocate and initialize a new task, to run on the calling thread. The new
+ * task is returned, or NULL in case of lack of memory. The task count is
+ * incremented.
+ */
+static inline struct task *task_new_here()
+{
+	return task_new_on(tid);
+}
+
+/* Allocate and initialize a new task, to run on any thread. The new task is
+ * returned, or NULL in case of lack of memory. The task count is incremented.
+ */
+static inline struct task *task_new_anywhere()
+{
+	return task_new_on(-1);
+}
+
+/*
+ * Free a task. Its context must have been freed since it will be lost. The
+ * task count is decremented. If it is the current task, the current task
+ * pointer is reset.
+ */
+static inline void __task_free(struct task *t)
+{
+	if (t == th_ctx->current) {
+		th_ctx->current = NULL;
+		__ha_barrier_store();
+	}
+	BUG_ON(task_in_wq(t) || task_in_rq(t));
+
+	BUG_ON((ulong)t->caller & 1);
+#ifdef DEBUG_TASK
+	HA_ATOMIC_STORE(&t->debug.prev_caller, HA_ATOMIC_LOAD(&t->caller));
+#endif
+	HA_ATOMIC_STORE(&t->caller, (void*)1); // make sure to crash if used after free
+
+	pool_free(pool_head_task, t);
+	th_ctx->nb_tasks--;
+	if (unlikely(stopping))
+		pool_flush(pool_head_task);
+}
+
+/* Destroys a task: it's unlinked from the wait queues and is freed if it's the
+ * current task or not queued; otherwise it's marked to be freed by the
+ * scheduler. It does nothing if <t> is NULL.
+ */
+static inline void task_destroy(struct task *t)
+{
+	if (!t)
+		return;
+
+	task_unlink_wq(t);
+	/* We don't have to explicitly remove from the run queue.
+	 * If we are in the runqueue, the test below will set t->process
+	 * to NULL, and the task will be free'd when it'll be its turn
+	 * to run.
+	 */
+
+	/* There's no need to protect t->state with a lock, as the task
+	 * has to run on the current thread.
+	 */
+	if (t == th_ctx->current || !(t->state & (TASK_QUEUED | TASK_RUNNING)))
+		__task_free(t);
+	else
+		t->process = NULL;
+}
+
+/* Should only be called by the thread responsible for the tasklet */
+static inline void tasklet_free(struct tasklet *tl)
+{
+	if (!tl)
+		return;
+
+	if (MT_LIST_DELETE(list_to_mt_list(&tl->list)))
+		_HA_ATOMIC_DEC(&ha_thread_ctx[tl->tid >= 0 ? tl->tid : tid].rq_total);
+
+	BUG_ON((ulong)tl->caller & 1);
+#ifdef DEBUG_TASK
+	HA_ATOMIC_STORE(&tl->debug.prev_caller, HA_ATOMIC_LOAD(&tl->caller));
+#endif
+	HA_ATOMIC_STORE(&tl->caller, (void*)1); // make sure to crash if used after free
+	pool_free(pool_head_tasklet, tl);
+	if (unlikely(stopping))
+		pool_flush(pool_head_tasklet);
+}
+
+static inline void tasklet_set_tid(struct tasklet *tl, int tid)
+{
+	tl->tid = tid;
+}
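A short usage sketch of the tasklet side of this API; my_io_cb and my_conn are hypothetical, and the tasklet stays bound to the current thread by default:

struct tasklet *tl = tasklet_new();

if (tl) {
	tl->process = my_io_cb;   /* same signature as a task callback */
	tl->context = my_conn;
	tasklet_wakeup(tl);       /* runs from the current thread's tasklet lists */
}

/* later, from the owner thread only: */
tasklet_free(tl);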
+
+/* Ensure <task> will be woken up at most at <when>. If the task is already in
+ * the run queue (but not running), nothing is done. It may be used that way
+ * with a delay :  task_schedule(task, tick_add(now_ms, delay));
+ * It MUST NOT be used with a timer in the past, and even less with
+ * TICK_ETERNITY (which would block all timers). Note that passing it directly
+ * now_ms without using tick_add() will definitely make this happen once every
+ * 49.7 days.
+ */
+#define task_schedule(t, w) \
+	_task_schedule(t, w, MK_CALLER(WAKEUP_TYPE_TASK_SCHEDULE, 0, 0))
+
+static inline void _task_schedule(struct task *task, int when, const struct ha_caller *caller)
+{
+	/* TODO: mthread, check if there is no risk with this test */
+	if (task_in_rq(task))
+		return;
+
+#ifdef USE_THREAD
+	if (task->tid < 0) {
+		/* FIXME: is it really needed to lock the WQ during the check ? */
+		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
+		if (task_in_wq(task))
+			when = tick_first(when, task->expire);
+
+		task->expire = when;
+		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key)) {
+			if (likely(caller)) {
+				caller = HA_ATOMIC_XCHG(&task->caller, caller);
+				BUG_ON((ulong)caller & 1);
+#ifdef DEBUG_TASK
+				HA_ATOMIC_STORE(&task->debug.prev_caller, caller);
+#endif
+			}
+			__task_queue(task, &tg_ctx->timers);
+		}
+		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
+	} else
+#endif
+	{
+		BUG_ON(task->tid != tid);
+		if (task_in_wq(task))
+			when = tick_first(when, task->expire);
+
+		task->expire = when;
+		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key)) {
+			if (likely(caller)) {
+				caller = HA_ATOMIC_XCHG(&task->caller, caller);
+				BUG_ON((ulong)caller & 1);
+#ifdef DEBUG_TASK
+				HA_ATOMIC_STORE(&task->debug.prev_caller, caller);
+#endif
+			}
+			__task_queue(task, &th_ctx->timers);
+		}
+	}
+}
+
+/* returns the string corresponding to a task type as found in the task caller
+ * locations.
+ */
+static inline const char *task_wakeup_type_str(uint t)
+{
+	switch (t) {
+	case WAKEUP_TYPE_TASK_WAKEUP          : return "task_wakeup";
+	case WAKEUP_TYPE_TASK_INSTANT_WAKEUP  : return "task_instant_wakeup";
+	case WAKEUP_TYPE_TASKLET_WAKEUP       : return "tasklet_wakeup";
+	case WAKEUP_TYPE_TASKLET_WAKEUP_AFTER : return "tasklet_wakeup_after";
+	case WAKEUP_TYPE_TASK_QUEUE           : return "task_queue";
+	case WAKEUP_TYPE_TASK_SCHEDULE        : return "task_schedule";
+	case WAKEUP_TYPE_APPCTX_WAKEUP        : return "appctx_wakeup";
+	default                               : return "?";
+	}
+}
+
+/* This function registers a new signal. <purge> is a list head attached to
+ * the registering context (e.g. the current Lua execution context), so that
+ * its pending signals can be purged when that context terminates. <event> is
+ * a list head attached to another task; when an event occurs there, the task
+ * <wakeup> must be woken up. This is useful with external events like TCP I/O
+ * or sleep functions. This function allocates memory for the signal.
+ */
+static inline struct notification *notification_new(struct list *purge, struct list *event, struct task *wakeup)
+{
+	struct notification *com = pool_alloc(pool_head_notification);
+	if (!com)
+		return NULL;
+	LIST_APPEND(purge, &com->purge_me);
+	LIST_APPEND(event, &com->wake_me);
+	HA_SPIN_INIT(&com->lock);
+	com->task = wakeup;
+	return com;
+}
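How these notification primitives are meant to fit together, as a hedged sketch; the two list heads and my_task are hypothetical, and notification_wake()/notification_purge() are defined just below:

struct notification *com = notification_new(&my_purge_list, &watched_wake_list, my_task);

if (com) {
	/* event side, possibly run by another owner: wakes my_task with
	 * TASK_WOKEN_MSG and consumes the signal
	 */
	notification_wake(&watched_wake_list);

	/* registering side, when its context terminates: */
	notification_purge(&my_purge_list);
}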
+
+/* This function purges all the pending signals when the Lua execution is
+ * finished. It prevents a coprocess from trying to wake a deleted task, and
+ * releases the memory associated with the signals. The purge list is not
+ * locked because it is owned by only one process; before browsing this list,
+ * the caller must ensure being the only one browsing it.
+ */
+static inline void notification_purge(struct list *purge)
+{
+	struct notification *com, *back;
+
+	/* Delete all pending communication signals. */
+	list_for_each_entry_safe(com, back, purge, purge_me) {
+		HA_SPIN_LOCK(NOTIF_LOCK, &com->lock);
+		LIST_DELETE(&com->purge_me);
+		if (!com->task) {
+			HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
+			pool_free(pool_head_notification, com);
+			continue;
+		}
+		com->task = NULL;
+		HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
+	}
+}
+
+/* In some cases, the disconnected notifications must be cleared. This function
+ * just releases the memory blocks. The purge list is not locked because it is
+ * owned by only one process; before browsing this list, the caller must ensure
+ * being the only one browsing it. The <com> entries are not locked because once
+ * com->task is NULL, the notification is no longer used.
+ */
+static inline void notification_gc(struct list *purge)
+{
+	struct notification *com, *back;
+
+	/* Delete all pending communication signals. */
+	list_for_each_entry_safe (com, back, purge, purge_me) {
+		if (com->task)
+			continue;
+		LIST_DELETE(&com->purge_me);
+		pool_free(pool_head_notification, com);
+	}
+}
+
+/* This function sends signals. It wakes all the tasks attached to a list
+ * head, then removes the signals and frees the used memory. The wake list
+ * is not locked because it is owned by only one process; before browsing
+ * this list, the caller must ensure being the only one browsing it.
+ */
+static inline void notification_wake(struct list *wake)
+{
+	struct notification *com, *back;
+
+	/* Wake task and delete all pending communication signals. */
+	list_for_each_entry_safe(com, back, wake, wake_me) {
+		HA_SPIN_LOCK(NOTIF_LOCK, &com->lock);
+		LIST_DELETE(&com->wake_me);
+		if (!com->task) {
+			HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
+			pool_free(pool_head_notification, com);
+			continue;
+		}
+		task_wakeup(com->task, TASK_WOKEN_MSG);
+		com->task = NULL;
+		HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
+	}
+}
+
+/* This function returns true if some notifications are pending */
+static inline int notification_registered(struct list *wake)
+{
+	return !LIST_ISEMPTY(wake);
+}
+
+#endif /* _HAPROXY_TASK_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/tcp_rules.h b/include/haproxy/tcp_rules.h
new file mode 100644
index 0000000..2ed515e
--- /dev/null
+++ b/include/haproxy/tcp_rules.h
@@ -0,0 +1,52 @@
+/*
+ * include/haproxy/tcp_rules.h
+ * This file contains "tcp" rules definitions
+ *
+ * Copyright (C) 2000-2016 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_TCP_RULES_H +#define _HAPROXY_TCP_RULES_H + +#include <haproxy/action-t.h> +#include <haproxy/api.h> +#include <haproxy/session-t.h> +#include <haproxy/stream-t.h> + +int tcp_inspect_request(struct stream *s, struct channel *req, int an_bit); +int tcp_inspect_response(struct stream *s, struct channel *rep, int an_bit); +int tcp_exec_l4_rules(struct session *sess); +int tcp_exec_l5_rules(struct session *sess); + +void tcp_req_conn_keywords_register(struct action_kw_list *kw_list); +void tcp_req_sess_keywords_register(struct action_kw_list *kw_list); +void tcp_req_cont_keywords_register(struct action_kw_list *kw_list); +void tcp_res_cont_keywords_register(struct action_kw_list *kw_list); + +struct action_kw *tcp_req_conn_action(const char *kw); +struct action_kw *tcp_req_sess_action(const char *kw); +struct action_kw *tcp_req_cont_action(const char *kw); +struct action_kw *tcp_res_cont_action(const char *kw); + +#endif /* _HAPROXY_TCP_RULES_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/tcpcheck-t.h b/include/haproxy/tcpcheck-t.h new file mode 100644 index 0000000..8878995 --- /dev/null +++ b/include/haproxy/tcpcheck-t.h @@ -0,0 +1,242 @@ +/* + * include/haproxy/tcpcheck-t.h + * TCP check definitions, enums, macros and bitfields. + * + * Copyright 2000-2009,2020 Willy Tarreau <w@1wt.eu> + * Copyright 2007-2010 Krzysztof Piotr Oledzki <ole@ans.pl> + * Copyright 2013 Baptiste Assmann <bedis9@gmail.com> + * Copyright 2020 Gaetan Rivet <grive@u256.net> + * Copyright 2020 Christopher Faulet <cfaulet@haproxy.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#ifndef _HAPROXY_TCPCHECK_T_H +#define _HAPROXY_TCPCHECK_T_H + +#include <import/ebtree-t.h> +#include <import/ist.h> +#include <haproxy/api-t.h> +#include <haproxy/buf-t.h> +#include <haproxy/check-t.h> +#include <haproxy/connection-t.h> +#include <haproxy/obj_type-t.h> +#include <haproxy/vars-t.h> + +/* options for tcp-check connect */ +#define TCPCHK_OPT_NONE 0x0000 /* no options specified, default */ +#define TCPCHK_OPT_SEND_PROXY 0x0001 /* send proxy-protocol string */ +#define TCPCHK_OPT_SSL 0x0002 /* SSL connection */ +#define TCPCHK_OPT_LINGER 0x0004 /* Do not RST connection, let it linger */ +#define TCPCHK_OPT_DEFAULT_CONNECT 0x0008 /* Do a connect using server params */ +#define TCPCHK_OPT_IMPLICIT 0x0010 /* Implicit connect */ +#define TCPCHK_OPT_SOCKS4 0x0020 /* check the connection via socks4 proxy */ +#define TCPCHK_OPT_HAS_DATA 0x0040 /* data should be sent after connection */ + +enum tcpcheck_send_type { + TCPCHK_SEND_UNDEF = 0, /* Send is not parsed. */ + TCPCHK_SEND_STRING, /* Send an ASCII string. */ + TCPCHK_SEND_BINARY, /* Send a binary sequence. */ + TCPCHK_SEND_STRING_LF, /* Send an ASCII log-format string. */ + TCPCHK_SEND_BINARY_LF, /* Send a binary log-format sequence. 
*/
+	TCPCHK_SEND_HTTP,       /* Send an HTTP request */
+};
+
+/* flags for tcp-check send */
+#define TCPCHK_SND_HTTP_FL_URI_FMT    0x0001 /* Use a log-format string for the uri */
+#define TCPCHK_SND_HTTP_FL_BODY_FMT   0x0002 /* Use a log-format string for the body */
+#define TCPCHK_SND_HTTP_FROM_OPT      0x0004 /* Send rule coming from "option httpchk" directive */
+
+enum tcpcheck_eval_ret {
+	TCPCHK_EVAL_WAIT = 0,
+	TCPCHK_EVAL_STOP,
+	TCPCHK_EVAL_CONTINUE,
+};
+
+enum tcpcheck_expect_type {
+	TCPCHK_EXPECT_UNDEF = 0,         /* Match is not used. */
+	TCPCHK_EXPECT_STRING,            /* Matches a string. */
+	TCPCHK_EXPECT_STRING_REGEX,      /* Matches a regular pattern. */
+	TCPCHK_EXPECT_STRING_LF,         /* Matches a log-format string. */
+	TCPCHK_EXPECT_BINARY,            /* Matches a binary sequence on a hex-encoded text. */
+	TCPCHK_EXPECT_BINARY_REGEX,      /* Matches a regular pattern on a hex-encoded text. */
+	TCPCHK_EXPECT_BINARY_LF,         /* Matches a log-format binary sequence on a hex-encoded text. */
+	TCPCHK_EXPECT_CUSTOM,            /* Execute a custom function. */
+	TCPCHK_EXPECT_HTTP_STATUS,       /* Matches a list of codes on the HTTP status */
+	TCPCHK_EXPECT_HTTP_STATUS_REGEX, /* Matches a regular pattern on the HTTP status */
+	TCPCHK_EXPECT_HTTP_HEADER,       /* Matches on HTTP headers */
+	TCPCHK_EXPECT_HTTP_BODY,         /* Matches a string on the HTTP payload */
+	TCPCHK_EXPECT_HTTP_BODY_REGEX,   /* Matches a regular pattern on the HTTP payload */
+	TCPCHK_EXPECT_HTTP_BODY_LF,      /* Matches a log-format string on the HTTP payload */
+};
+
+/* tcp-check expect flags */
+#define TCPCHK_EXPT_FL_INV             0x0001 /* Matching is inversed */
+#define TCPCHK_EXPT_FL_HTTP_HNAME_STR  0x0002 /* Exact match on the HTTP header name */
+#define TCPCHK_EXPT_FL_HTTP_HNAME_BEG  0x0004 /* Prefix match on the HTTP header name */
+#define TCPCHK_EXPT_FL_HTTP_HNAME_END  0x0008 /* Suffix match on the HTTP header name */
+#define TCPCHK_EXPT_FL_HTTP_HNAME_SUB  0x0010 /* Substring match on the HTTP header name */
+#define TCPCHK_EXPT_FL_HTTP_HNAME_REG  0x0020 /* Regex match on the HTTP header name */
+#define TCPCHK_EXPT_FL_HTTP_HNAME_FMT  0x0040 /* The HTTP header name is a log-format string */
+#define TCPCHK_EXPT_FL_HTTP_HVAL_NONE  0x0080 /* No match on the HTTP header value */
+#define TCPCHK_EXPT_FL_HTTP_HVAL_STR   0x0100 /* Exact match on the HTTP header value */
+#define TCPCHK_EXPT_FL_HTTP_HVAL_BEG   0x0200 /* Prefix match on the HTTP header value */
+#define TCPCHK_EXPT_FL_HTTP_HVAL_END   0x0400 /* Suffix match on the HTTP header value */
+#define TCPCHK_EXPT_FL_HTTP_HVAL_SUB   0x0800 /* Substring match on the HTTP header value */
+#define TCPCHK_EXPT_FL_HTTP_HVAL_REG   0x1000 /* Regex match on the HTTP header value */
+#define TCPCHK_EXPT_FL_HTTP_HVAL_FMT   0x2000 /* The HTTP header value is a log-format string */
+#define TCPCHK_EXPT_FL_HTTP_HVAL_FULL  0x4000 /* Match the full header value (no stop on commas) */
+
+#define TCPCHK_EXPT_FL_HTTP_HNAME_TYPE 0x003E /* Mask to get matching method on header name */
+#define TCPCHK_EXPT_FL_HTTP_HVAL_TYPE  0x1F00 /* Mask to get matching method on header value */
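A small sketch of how these flags compose and how the _TYPE masks recover the matching method; the values follow directly from the defines above:

/* an expect rule matching a header name exactly and its value by regex */
unsigned int flags = TCPCHK_EXPT_FL_HTTP_HNAME_STR | TCPCHK_EXPT_FL_HTTP_HVAL_REG;

unsigned int name_how = flags & TCPCHK_EXPT_FL_HTTP_HNAME_TYPE; /* == ..._HNAME_STR */
unsigned int val_how  = flags & TCPCHK_EXPT_FL_HTTP_HVAL_TYPE;  /* == ..._HVAL_REG */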
+
+/* possible actions for tcpcheck_rule->action */
+enum tcpcheck_rule_type {
+	TCPCHK_ACT_SEND = 0,  /* send action, regular string format */
+	TCPCHK_ACT_EXPECT,    /* expect action, either regular or binary string */
+	TCPCHK_ACT_CONNECT,   /* connect action, to probe a new port */
+	TCPCHK_ACT_COMMENT,   /* no action, simply a comment used for logs */
+	TCPCHK_ACT_ACTION_KW, /* custom registered action_kw rule. */
+};
+
+#define TCPCHK_RULES_NONE           0x00000000
+#define TCPCHK_RULES_UNUSED_TCP_RS  0x00000001 /* An unused tcp-check ruleset exists */
+#define TCPCHK_RULES_UNUSED_HTTP_RS 0x00000002 /* An unused http-check ruleset exists */
+#define TCPCHK_RULES_UNUSED_RS      0x00000003 /* Mask for unused ruleset */
+
+#define TCPCHK_RULES_PGSQL_CHK   0x00000010
+#define TCPCHK_RULES_REDIS_CHK   0x00000020
+#define TCPCHK_RULES_SMTP_CHK    0x00000030
+#define TCPCHK_RULES_HTTP_CHK    0x00000040
+#define TCPCHK_RULES_MYSQL_CHK   0x00000050
+#define TCPCHK_RULES_LDAP_CHK    0x00000060
+#define TCPCHK_RULES_SSL3_CHK    0x00000070
+#define TCPCHK_RULES_AGENT_CHK   0x00000080
+#define TCPCHK_RULES_SPOP_CHK    0x00000090
+/* Unused 0x000000A0..0x00000FF0 (reserved for future proto) */
+#define TCPCHK_RULES_TCP_CHK     0x00000FF0
+#define TCPCHK_RULES_PROTO_CHK   0x00000FF0 /* Mask to cover protocol check */
+
+struct check;
+struct tcpcheck_connect {
+	char *sni;                     /* server name to use for SSL connections */
+	char *alpn;                    /* ALPN to use for the SSL connection */
+	int alpn_len;                  /* ALPN string length */
+	const struct mux_proto_list *mux_proto; /* the mux to use for all outgoing connections (specified by the "proto" keyword) */
+	uint16_t options;              /* options when setting up a new connection */
+	uint16_t port;                 /* port to connect to */
+	struct sample_expr *port_expr; /* sample expr to determine the port, may be NULL */
+	struct sockaddr_storage addr;  /* the address to connect to */
+};
+
+struct tcpcheck_http_hdr {
+	struct ist  name;  /* the header name */
+	struct list value; /* the log-format string value */
+	struct list list;  /* header chained list */
+};
+
+struct tcpcheck_codes {
+	unsigned int (*codes)[2]; /* an array of ranges of codes: [0]=min [1]=max */
+	size_t num;               /* number of entries in the array */
+};
+
+struct tcpcheck_send {
+	enum tcpcheck_send_type type;
+	union {
+		struct ist  data; /* an ASCII string or a binary sequence */
+		struct list fmt;  /* an ASCII or hexa log-format string */
+		struct {
+			unsigned int flags;           /* TCPCHK_SND_HTTP_FL_* */
+			struct http_meth meth;        /* the HTTP request method */
+			union {
+				struct ist  uri;      /* the HTTP request uri is a string */
+				struct list uri_fmt;  /* or a log-format string */
+			};
+			struct ist vsn;               /* the HTTP request version string */
+			struct list hdrs;             /* the HTTP request header list */
+			union {
+				struct ist  body;     /* the HTTP request payload is a string */
+				struct list body_fmt; /* or a log-format string */
+			};
+		} http;           /* Info about the HTTP request to send */
+	};
+};
+
+struct tcpcheck_expect {
+	enum tcpcheck_expect_type type; /* Type of pattern used for matching. */
+	unsigned int flags;             /* TCPCHK_EXPT_FL_* */
+	union {
+		struct ist data;             /* Matching a literal string / binary anywhere in the response. */
+		struct my_regex *regex;      /* Matching a regex pattern. */
+		struct tcpcheck_codes codes; /* Matching a list of codes */
+		struct list fmt;             /* Matching a log-format string / binary */
+		struct {
+			union {
+				struct ist name;
+				struct list name_fmt;
+				struct my_regex *name_re;
+			};
+			union {
+				struct ist value;
+				struct list value_fmt;
+				struct my_regex *value_re;
+			};
+		} hdr;                       /* Matching a header pattern */
+
+
+		/* custom function to eval expect rule */
+		enum tcpcheck_eval_ret (*custom)(struct check *, struct tcpcheck_rule *, int);
+	};
+	struct tcpcheck_rule *head;     /* first expect of a chain. */
+	int min_recv;                   /* Minimum amount of data before an expect can be applied.
(default: -1, ignored) */
+	enum healthcheck_status ok_status;   /* The healthcheck status to use on success (default: L7OKD) */
+	enum healthcheck_status err_status;  /* The healthcheck status to use on error (default: L7RSP) */
+	enum healthcheck_status tout_status; /* The healthcheck status to use on timeout (default: L7TOUT) */
+	struct list onerror_fmt;             /* log-format string to use as comment on error */
+	struct list onsuccess_fmt;           /* log-format string to use as comment on success (if last rule) */
+	struct sample_expr *status_expr;     /* sample expr to determine the check status code */
+};
+
+struct tcpcheck_action_kw {
+	struct act_rule *rule;
+};
+
+struct tcpcheck_rule {
+	struct list list;               /* list linked to from the proxy */
+	enum tcpcheck_rule_type action; /* type of the rule. */
+	int index;                      /* Index within the list. Starts at 0. */
+	char *comment;                  /* comment to be used in the logs and on the stats socket */
+	union {
+		struct tcpcheck_connect connect;     /* Connect rule. */
+		struct tcpcheck_send send;           /* Send rule. */
+		struct tcpcheck_expect expect;       /* Expected pattern. */
+		struct tcpcheck_action_kw action_kw; /* Custom action. */
+	};
+};
+
+/* A list of tcp-check vars, to be registered before executing a ruleset */
+struct tcpcheck_var {
+	struct ist name;         /* the variable name with the scope */
+	struct sample_data data; /* the data associated to the variable */
+	struct list list;        /* element to chain tcp-check vars */
+};
+
+/* a list of tcp-check rules */
+struct tcpcheck_rules {
+	unsigned int flags;      /* flags applied to the rules */
+	struct list *list;       /* the list of tcpcheck_rules */
+	struct list preset_vars; /* The list of variables to preset before executing the ruleset */
+};
+
+/* A list of tcp-check rules with a name */
+struct tcpcheck_ruleset {
+	struct list rules;     /* the list of tcpcheck_rule */
+	struct ebpt_node node; /* node in the shared tree */
+};
+
+
+#endif /* _HAPROXY_TCPCHECK_T_H */
diff --git a/include/haproxy/tcpcheck.h b/include/haproxy/tcpcheck.h
new file mode 100644
index 0000000..3abd1ef
--- /dev/null
+++ b/include/haproxy/tcpcheck.h
@@ -0,0 +1,125 @@
+/*
+ * include/haproxy/tcpcheck.h
+ * Functions prototypes for the TCP checks.
+ *
+ * Copyright 2000-2009,2020 Willy Tarreau <w@1wt.eu>
+ * Copyright 2007-2010 Krzysztof Piotr Oledzki <ole@ans.pl>
+ * Copyright 2013 Baptiste Assmann <bedis9@gmail.com>
+ * Copyright 2020 Gaetan Rivet <grive@u256.net>
+ * Copyright 2020 Christopher Faulet <cfaulet@haproxy.com>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_TCPCHECK_H +#define _HAPROXY_TCPCHECK_H + +#include <haproxy/action.h> +#include <haproxy/check-t.h> +#include <haproxy/pool-t.h> +#include <haproxy/proxy-t.h> +#include <haproxy/tcpcheck-t.h> + +extern struct action_kw_list tcp_check_keywords; +extern struct pool_head *pool_head_tcpcheck_rule; + +int tcpcheck_get_step_id(const struct check *check, const struct tcpcheck_rule *rule); +struct tcpcheck_rule *get_first_tcpcheck_rule(const struct tcpcheck_rules *rules); + +struct tcpcheck_ruleset *create_tcpcheck_ruleset(const char *name); +struct tcpcheck_ruleset *find_tcpcheck_ruleset(const char *name); +void free_tcpcheck_ruleset(struct tcpcheck_ruleset *rs); + +void free_tcpcheck(struct tcpcheck_rule *rule, int in_pool); +void deinit_proxy_tcpcheck(struct proxy *px); + +struct tcpcheck_var *create_tcpcheck_var(const struct ist name); +void free_tcpcheck_var(struct tcpcheck_var *var); +int dup_tcpcheck_vars(struct list *dst, const struct list *src); +void free_tcpcheck_vars(struct list *vars); + +int add_tcpcheck_expect_str(struct tcpcheck_rules *rules, const char *str); +int add_tcpcheck_send_strs(struct tcpcheck_rules *rules, const char * const *strs); +int tcpcheck_add_http_rule(struct tcpcheck_rule *chk, struct tcpcheck_rules *rules, char **errmsg); + +void free_tcpcheck_http_hdr(struct tcpcheck_http_hdr *hdr); + +enum tcpcheck_eval_ret tcpcheck_mysql_expect_iniths(struct check *check, struct tcpcheck_rule *rule, int last_read); +enum tcpcheck_eval_ret tcpcheck_mysql_expect_ok(struct check *check, struct tcpcheck_rule *rule, int last_read); +enum tcpcheck_eval_ret tcpcheck_ldap_expect_bindrsp(struct check *check, struct tcpcheck_rule *rule, int last_read); +enum tcpcheck_eval_ret tcpcheck_spop_expect_agenthello(struct check *check, struct tcpcheck_rule *rule, int last_read); +enum tcpcheck_eval_ret tcpcheck_agent_expect_reply(struct check *check, struct tcpcheck_rule *rule, int last_read); +enum tcpcheck_eval_ret tcpcheck_eval_connect(struct check *check, struct tcpcheck_rule *rule); +enum tcpcheck_eval_ret tcpcheck_eval_send(struct check *check, struct tcpcheck_rule *rule); +enum tcpcheck_eval_ret tcpcheck_eval_recv(struct check *check, struct tcpcheck_rule *rule); +enum tcpcheck_eval_ret tcpcheck_eval_expect_http(struct check *check, struct tcpcheck_rule *rule, int last_read); +enum tcpcheck_eval_ret tcpcheck_eval_expect(struct check *check, struct tcpcheck_rule *rule, int last_read); +enum tcpcheck_eval_ret tcpcheck_eval_action_kw(struct check *check, struct tcpcheck_rule *rule); +int tcpcheck_main(struct check *check); +struct tcpcheck_rule *parse_tcpcheck_action(char **args, int cur_arg, struct proxy *px, + struct list *rules, struct action_kw *kw, + const char *file, int line, char **errmsg); +struct tcpcheck_rule *parse_tcpcheck_connect(char **args, int cur_arg, struct proxy *px, struct list *rules, + const char *file, int line, char **errmsg); +struct tcpcheck_rule *parse_tcpcheck_send(char **args, int cur_arg, struct proxy *px, struct list *rules, + const char *file, int line, char **errmsg); +struct tcpcheck_rule *parse_tcpcheck_send_http(char **args, int cur_arg, struct proxy *px, struct list *rules, + const char *file, int line, char **errmsg); +struct tcpcheck_rule *parse_tcpcheck_comment(char **args, int cur_arg, struct proxy 
*px, struct list *rules, + const char *file, int line, char **errmsg); +struct tcpcheck_rule *parse_tcpcheck_expect(char **args, int cur_arg, struct proxy *px, + struct list *rules, unsigned int proto, + const char *file, int line, char **errmsg); + +int proxy_parse_tcp_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line); +int proxy_parse_redis_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line); +int proxy_parse_ssl_hello_chk_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line); +int proxy_parse_smtpchk_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line); +int proxy_parse_pgsql_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line); +int proxy_parse_mysql_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line); +int proxy_parse_ldap_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line); +int proxy_parse_spop_check_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line); +int proxy_parse_httpchk_opt(char **args, int cur_arg, struct proxy *curpx, const struct proxy *defpx, + const char *file, int line); + +void tcp_check_keywords_register(struct action_kw_list *kw_list); + +/* Return the struct action_kw associated to a keyword */ +static inline struct action_kw *action_kw_tcp_check_lookup(const char *kw) +{ + return action_lookup(&tcp_check_keywords.list, kw); +} + +static inline void action_kw_tcp_check_build_list(struct buffer *chk) +{ + action_build_list(&tcp_check_keywords.list, chk); +} + +#endif /* _HAPROXY_TCPCHECK_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/thread-t.h b/include/haproxy/thread-t.h new file mode 100644 index 0000000..f3552c2 --- /dev/null +++ b/include/haproxy/thread-t.h @@ -0,0 +1,165 @@ +/* + * include/haproxy/thread-t.h + * Definitions and types for thread support. + * + * Copyright (C) 2017 Christopher Faulet - cfaulet@haproxy.com + * Copyright (C) 2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_THREAD_T_H +#define _HAPROXY_THREAD_T_H + +#include <haproxy/defaults.h> + +/* Note: this file mainly contains 3 sections: + * - one used solely when USE_THREAD is *not* set + * - one used solely when USE_THREAD is set + * - a common one. 
+ */ + +#ifndef USE_THREAD + +/********************** THREADS DISABLED ************************/ + +/* These macros allow to make some struct fields or local variables optional */ +#define __decl_spinlock(lock) +#define __decl_aligned_spinlock(lock) +#define __decl_rwlock(lock) +#define __decl_aligned_rwlock(lock) + +#elif !defined(DEBUG_THREAD) && !defined(DEBUG_FULL) + +/************** THREADS ENABLED WITHOUT DEBUGGING **************/ + +/* declare a self-initializing spinlock */ +#define __decl_spinlock(lock) \ + HA_SPINLOCK_T (lock) = 0; + +/* declare a self-initializing spinlock, aligned on a cache line */ +#define __decl_aligned_spinlock(lock) \ + HA_SPINLOCK_T (lock) __attribute__((aligned(64))) = 0; + +/* declare a self-initializing rwlock */ +#define __decl_rwlock(lock) \ + HA_RWLOCK_T (lock) = 0; + +/* declare a self-initializing rwlock, aligned on a cache line */ +#define __decl_aligned_rwlock(lock) \ + HA_RWLOCK_T (lock) __attribute__((aligned(64))) = 0; + +#else /* !USE_THREAD */ + +/**************** THREADS ENABLED WITH DEBUGGING ***************/ + +/* declare a self-initializing spinlock */ +#define __decl_spinlock(lock) \ + HA_SPINLOCK_T (lock); \ + INITCALL1(STG_LOCK, ha_spin_init, &(lock)) + +/* declare a self-initializing spinlock, aligned on a cache line */ +#define __decl_aligned_spinlock(lock) \ + HA_SPINLOCK_T (lock) __attribute__((aligned(64))); \ + INITCALL1(STG_LOCK, ha_spin_init, &(lock)) + +/* declare a self-initializing rwlock */ +#define __decl_rwlock(lock) \ + HA_RWLOCK_T (lock); \ + INITCALL1(STG_LOCK, ha_rwlock_init, &(lock)) + +/* declare a self-initializing rwlock, aligned on a cache line */ +#define __decl_aligned_rwlock(lock) \ + HA_RWLOCK_T (lock) __attribute__((aligned(64))); \ + INITCALL1(STG_LOCK, ha_rwlock_init, &(lock)) + +#endif /* USE_THREAD */ + + +/*** Common parts below ***/ + +/* storage types used by spinlocks and RW locks */ +#define __HA_SPINLOCK_T unsigned long +#define __HA_RWLOCK_T unsigned long + + +/* When thread debugging is enabled, we remap HA_SPINLOCK_T and HA_RWLOCK_T to + * complex structures which embed debugging info. 
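+ *
+ * For illustration only (HA_SPIN_LOCK()/HA_SPIN_UNLOCK() come from thread.h
+ * and OTHER_LOCK from its lock labels; do_work() is a hypothetical caller):
+ * a lock declared and used as
+ *
+ *     __decl_spinlock(my_lock);
+ *     ...
+ *     HA_SPIN_LOCK(OTHER_LOCK, &my_lock);
+ *     do_work();
+ *     HA_SPIN_UNLOCK(OTHER_LOCK, &my_lock);
+ *
+ * builds unchanged in all three modes above, since only the macros differ.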
+ */
+#if !defined(DEBUG_THREAD) && !defined(DEBUG_FULL)
+
+#define HA_SPINLOCK_T        __HA_SPINLOCK_T
+#define HA_RWLOCK_T          __HA_RWLOCK_T
+
+#else /* !DEBUG_THREAD */
+
+#define HA_SPINLOCK_T       struct ha_spinlock
+#define HA_RWLOCK_T         struct ha_rwlock
+
+/* Debugging information that is only used when thread debugging is enabled */
+
+struct lock_stat {
+        uint64_t nsec_wait_for_write;
+        uint64_t nsec_wait_for_read;
+        uint64_t nsec_wait_for_seek;
+        uint64_t num_write_locked;
+        uint64_t num_write_unlocked;
+        uint64_t num_read_locked;
+        uint64_t num_read_unlocked;
+        uint64_t num_seek_locked;
+        uint64_t num_seek_unlocked;
+};
+
+struct ha_spinlock_state {
+        unsigned long owner;        /* a bit is set to 1 << tid for the lock owner */
+        unsigned long waiters;      /* a bit is set to 1 << tid for waiting threads */
+};
+
+struct ha_rwlock_state {
+        unsigned long cur_writer;   /* a bit is set to 1 << tid for the lock owner */
+        unsigned long wait_writers; /* a bit is set to 1 << tid for waiting writers */
+        unsigned long cur_readers;  /* a bit is set to 1 << tid for current readers */
+        unsigned long wait_readers; /* a bit is set to 1 << tid for waiting readers */
+        unsigned long cur_seeker;   /* a bit is set to 1 << tid for the lock seekers */
+        unsigned long wait_seekers; /* a bit is set to 1 << tid for waiting seekers */
+};
+
+struct ha_spinlock {
+        __HA_SPINLOCK_T lock;
+        struct {
+                struct ha_spinlock_state st[MAX_TGROUPS];
+                struct {
+                        const char *function;
+                        const char *file;
+                        int line;
+                } last_location; /* location of the last owner */
+        } info;
+};
+
+struct ha_rwlock {
+        __HA_RWLOCK_T lock;
+        struct {
+                struct ha_rwlock_state st[MAX_TGROUPS];
+                struct {
+                        const char *function;
+                        const char *file;
+                        int line;
+                } last_location; /* location of the last write owner */
+        } info;
+};
+
+#endif  /* DEBUG_THREAD */
+
+#endif /* _HAPROXY_THREAD_T_H */
diff --git a/include/haproxy/thread.h b/include/haproxy/thread.h
new file mode 100644
index 0000000..8c7520b
--- /dev/null
+++ b/include/haproxy/thread.h
@@ -0,0 +1,489 @@
+/*
+ * include/haproxy/thread.h
+ * Definitions, macros and inline functions used by threads.
+ *
+ * Copyright (C) 2017 Christopher Faulet - cfaulet@haproxy.com
+ * Copyright (C) 2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_THREAD_H +#define _HAPROXY_THREAD_H + +#include <haproxy/api.h> +#include <haproxy/thread-t.h> +#include <haproxy/tinfo.h> + + +/* Note: this file mainly contains 5 sections: + * - a small common part, which also corresponds to the common API + * - one used solely when USE_THREAD is *not* set + * - one used solely when USE_THREAD is set + * - one used solely when USE_THREAD is set WITHOUT debugging + * - one used solely when USE_THREAD is set WITH debugging + * + */ + + +/* Generic exports */ +int parse_nbthread(const char *arg, char **err); +void ha_tkill(unsigned int thr, int sig); +void ha_tkillall(int sig); +void ha_thread_relax(void); +int thread_detect_binding_discrepancies(void); +int thread_detect_more_than_cpus(void); +int thread_map_to_groups(); +int thread_resolve_group_mask(struct thread_set *ts, int defgrp, char **err); +int parse_thread_set(const char *arg, struct thread_set *ts, char **err); +extern int thread_cpus_enabled_at_boot; + + +#ifndef USE_THREAD + +/********************** THREADS DISABLED ************************/ + +/* Only way found to replace variables with constants that are optimized away + * at build time. + */ +enum { all_tgroups_mask = 1UL }; +enum { tid_bit = 1UL }; +enum { tid = 0 }; +enum { tgid = 1 }; + +#define HA_SPIN_INIT(l) do { /* do nothing */ } while(0) +#define HA_SPIN_DESTROY(l) do { /* do nothing */ } while(0) +#define HA_SPIN_LOCK(lbl, l) do { /* do nothing */ } while(0) +#define HA_SPIN_TRYLOCK(lbl, l) ({ 0; }) +#define HA_SPIN_UNLOCK(lbl, l) do { /* do nothing */ } while(0) + +#define HA_RWLOCK_INIT(l) do { /* do nothing */ } while(0) +#define HA_RWLOCK_DESTROY(l) do { /* do nothing */ } while(0) +#define HA_RWLOCK_WRLOCK(lbl, l) do { /* do nothing */ } while(0) +#define HA_RWLOCK_TRYWRLOCK(lbl, l) ({ 0; }) +#define HA_RWLOCK_WRUNLOCK(lbl, l) do { /* do nothing */ } while(0) +#define HA_RWLOCK_RDLOCK(lbl, l) do { /* do nothing */ } while(0) +#define HA_RWLOCK_TRYRDLOCK(lbl, l) ({ 0; }) +#define HA_RWLOCK_RDUNLOCK(lbl, l) do { /* do nothing */ } while(0) + +#define HA_RWLOCK_SKLOCK(lbl,l) do { /* do nothing */ } while(0) +#define HA_RWLOCK_SKTOWR(lbl,l) do { /* do nothing */ } while(0) +#define HA_RWLOCK_WRTOSK(lbl,l) do { /* do nothing */ } while(0) +#define HA_RWLOCK_SKTORD(lbl,l) do { /* do nothing */ } while(0) +#define HA_RWLOCK_WRTORD(lbl,l) do { /* do nothing */ } while(0) +#define HA_RWLOCK_SKUNLOCK(lbl,l) do { /* do nothing */ } while(0) +#define HA_RWLOCK_TRYSKLOCK(lbl,l) ({ 0; }) +#define HA_RWLOCK_TRYRDTOSK(lbl,l) ({ 0; }) + +#define ha_sigmask(how, set, oldset) sigprocmask(how, set, oldset) + +/* Sets the current thread to a valid one described by <thr>, or to any thread + * and any group if NULL (e.g. for use during boot where they're not totally + * initialized). 
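+ * For example (a sketch of assumed usage, not shown in this file): early
+ * boot code may call ha_set_thread(NULL) so that ti/tg/th_ctx point to the
+ * first entries, and each started thread later calls
+ * ha_set_thread(&ha_thread_info[i]) with its own slot.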
+ */
+static inline void ha_set_thread(const struct thread_info *thr)
+{
+        if (thr) {
+                ti = thr;
+                tg = ti->tg;
+                th_ctx = &ha_thread_ctx[ti->tid];
+        } else {
+                ti = &ha_thread_info[0];
+                tg = &ha_tgroup_info[0];
+                th_ctx = &ha_thread_ctx[0];
+        }
+}
+
+static inline void thread_idle_now()
+{
+}
+
+static inline void thread_idle_end()
+{
+}
+
+static inline void thread_harmless_now()
+{
+}
+
+static inline int is_thread_harmless()
+{
+        return 1;
+}
+
+static inline void thread_harmless_end()
+{
+}
+
+static inline void thread_harmless_end_sig()
+{
+}
+
+static inline void thread_isolate()
+{
+}
+
+static inline void thread_isolate_full()
+{
+}
+
+static inline void thread_release()
+{
+}
+
+static inline unsigned long thread_isolated()
+{
+        return 1;
+}
+
+static inline void setup_extra_threads(void *(*handler)(void *))
+{
+}
+
+static inline void wait_for_threads_completion()
+{
+}
+
+static inline void set_thread_cpu_affinity()
+{
+}
+
+static inline unsigned long long ha_get_pthread_id(unsigned int thr)
+{
+        return 0;
+}
+
+#else /* !USE_THREAD */
+
+/********************** THREADS ENABLED ************************/
+
+#define PLOCK_LORW_INLINE_WAIT
+#include <import/plock.h>
+
+void thread_harmless_till_end(void);
+void thread_isolate(void);
+void thread_isolate_full(void);
+void thread_release(void);
+void ha_spin_init(HA_SPINLOCK_T *l);
+void ha_rwlock_init(HA_RWLOCK_T *l);
+void setup_extra_threads(void *(*handler)(void *));
+void wait_for_threads_completion();
+void set_thread_cpu_affinity();
+unsigned long long ha_get_pthread_id(unsigned int thr);
+
+extern volatile unsigned long all_tgroups_mask;
+extern volatile unsigned int rdv_requests;
+extern volatile unsigned int isolated_thread;
+extern THREAD_LOCAL unsigned int tid;  /* The thread id */
+extern THREAD_LOCAL unsigned int tgid; /* The thread group id (starts at 1) */
+
+#define ha_sigmask(how, set, oldset)  pthread_sigmask(how, set, oldset)
+
+/* Sets the current thread to a valid one described by <thr>, or to any thread
+ * and any group if NULL (e.g. for use during boot where they're not totally
+ * initialized).
+ */
+static inline void ha_set_thread(const struct thread_info *thr)
+{
+        if (thr) {
+                BUG_ON(!thr->ltid_bit);
+                BUG_ON(!thr->tg);
+                BUG_ON(!thr->tgid);
+
+                ti     = thr;
+                tg     = thr->tg;
+                tid    = thr->tid;
+                tgid   = thr->tgid;
+                th_ctx = &ha_thread_ctx[tid];
+                tg_ctx = &ha_tgroup_ctx[tgid-1];
+        } else {
+                tgid   = 1;
+                tid    = 0;
+                ti     = &ha_thread_info[0];
+                tg     = &ha_tgroup_info[0];
+                th_ctx = &ha_thread_ctx[0];
+                tg_ctx = &ha_tgroup_ctx[0];
+        }
+}
+
+/* Marks the thread as idle, which means that not only is it not doing anything
+ * dangerous, but in addition it has not started anything sensitive either.
+ * This essentially means that the thread currently is in the poller, thus
+ * outside of any execution block. Needs to be terminated using
+ * thread_idle_end(). This is needed to release a concurrent call to
+ * thread_isolate_full().
+ */
+static inline void thread_idle_now()
+{
+        HA_ATOMIC_OR(&tg_ctx->threads_idle, ti->ltid_bit);
+}
+
+/* Ends the idle period started by thread_idle_now(), i.e. the thread is
+ * about to restart engaging in sensitive operations. This must not be done on
+ * a thread marked harmless, as it could cause a deadlock between another
+ * thread waiting for idle again and thread_harmless_end() in this thread.
+ * + * The right sequence is thus: + * thread_idle_now(); + * thread_harmless_now(); + * poll(); + * thread_harmless_end(); + * thread_idle_end(); + */ +static inline void thread_idle_end() +{ + HA_ATOMIC_AND(&tg_ctx->threads_idle, ~ti->ltid_bit); +} + + +/* Marks the thread as harmless. Note: this must be true, i.e. the thread must + * not be touching any unprotected shared resource during this period. Usually + * this is called before poll(), but it may also be placed around very slow + * calls (eg: some crypto operations). Needs to be terminated using + * thread_harmless_end(). + */ +static inline void thread_harmless_now() +{ + HA_ATOMIC_OR(&tg_ctx->threads_harmless, ti->ltid_bit); +} + +/* Returns non-zero if the current thread is already harmless */ +static inline int is_thread_harmless() +{ + return !!(HA_ATOMIC_LOAD(&tg_ctx->threads_harmless) & ti->ltid_bit); +} + +/* Ends the harmless period started by thread_harmless_now(). Usually this is + * placed after the poll() call. If it is discovered that a job was running and + * is relying on the thread still being harmless, the thread waits for the + * other one to finish. + */ +static inline void thread_harmless_end() +{ + while (1) { + HA_ATOMIC_AND(&tg_ctx->threads_harmless, ~ti->ltid_bit); + if (likely(_HA_ATOMIC_LOAD(&rdv_requests) == 0)) + break; + thread_harmless_till_end(); + } +} + +/* Ends the harmless period started by thread_harmless_now(), but without + * waiting for isolated requests. This is meant to be used from signal handlers + * which might be called recursively while a thread already requested an + * isolation that must be ignored. It must not be used past a checkpoint where + * another thread could return and see the current thread as harmless before + * this call (or this could validate an isolation request by accident). + */ +static inline void thread_harmless_end_sig() +{ + HA_ATOMIC_AND(&tg_ctx->threads_harmless, ~ti->ltid_bit); +} + +/* an isolated thread has its ID in isolated_thread */ +static inline unsigned long thread_isolated() +{ + return _HA_ATOMIC_LOAD(&isolated_thread) == tid; +} + +/* Returns 1 if the cpu set is currently restricted for the process else 0. + * Currently only implemented for the Linux platform. 
+ */ +int thread_cpu_mask_forced(void); + +#if !defined(DEBUG_THREAD) && !defined(DEBUG_FULL) + +/* Thread debugging is DISABLED, these are the regular locking functions */ + +#define HA_SPIN_INIT(l) ({ (*l) = 0; }) +#define HA_SPIN_DESTROY(l) ({ (*l) = 0; }) +#define HA_SPIN_LOCK(lbl, l) pl_take_s(l) +#define HA_SPIN_TRYLOCK(lbl, l) (!pl_try_s(l)) +#define HA_SPIN_UNLOCK(lbl, l) pl_drop_s(l) + +#define HA_RWLOCK_INIT(l) ({ (*l) = 0; }) +#define HA_RWLOCK_DESTROY(l) ({ (*l) = 0; }) +#define HA_RWLOCK_WRLOCK(lbl,l) pl_take_w(l) +#define HA_RWLOCK_TRYWRLOCK(lbl,l) (!pl_try_w(l)) +#define HA_RWLOCK_WRUNLOCK(lbl,l) pl_drop_w(l) +#define HA_RWLOCK_RDLOCK(lbl,l) pl_take_r(l) +#define HA_RWLOCK_TRYRDLOCK(lbl,l) (!pl_try_r(l)) +#define HA_RWLOCK_RDUNLOCK(lbl,l) pl_drop_r(l) + +/* rwlock upgrades via seek locks */ +#define HA_RWLOCK_SKLOCK(lbl,l) pl_take_s(l) /* N --> S */ +#define HA_RWLOCK_SKTOWR(lbl,l) pl_stow(l) /* S --> W */ +#define HA_RWLOCK_WRTOSK(lbl,l) pl_wtos(l) /* W --> S */ +#define HA_RWLOCK_SKTORD(lbl,l) pl_stor(l) /* S --> R */ +#define HA_RWLOCK_WRTORD(lbl,l) pl_wtor(l) /* W --> R */ +#define HA_RWLOCK_SKUNLOCK(lbl,l) pl_drop_s(l) /* S --> N */ +#define HA_RWLOCK_TRYSKLOCK(lbl,l) (!pl_try_s(l)) /* N -?> S */ +#define HA_RWLOCK_TRYRDTOSK(lbl,l) (!pl_try_rtos(l)) /* R -?> S */ + +#else /* !defined(DEBUG_THREAD) && !defined(DEBUG_FULL) */ + +/* Thread debugging is ENABLED, these are the instrumented functions */ + +#define __SPIN_INIT(l) ({ (*l) = 0; }) +#define __SPIN_DESTROY(l) ({ (*l) = 0; }) +#define __SPIN_LOCK(l) pl_take_s(l) +#define __SPIN_TRYLOCK(l) (!pl_try_s(l)) +#define __SPIN_UNLOCK(l) pl_drop_s(l) + +#define __RWLOCK_INIT(l) ({ (*l) = 0; }) +#define __RWLOCK_DESTROY(l) ({ (*l) = 0; }) +#define __RWLOCK_WRLOCK(l) pl_take_w(l) +#define __RWLOCK_TRYWRLOCK(l) (!pl_try_w(l)) +#define __RWLOCK_WRUNLOCK(l) pl_drop_w(l) +#define __RWLOCK_RDLOCK(l) pl_take_r(l) +#define __RWLOCK_TRYRDLOCK(l) (!pl_try_r(l)) +#define __RWLOCK_RDUNLOCK(l) pl_drop_r(l) + +/* rwlock upgrades via seek locks */ +#define __RWLOCK_SKLOCK(l) pl_take_s(l) /* N --> S */ +#define __RWLOCK_SKTOWR(l) pl_stow(l) /* S --> W */ +#define __RWLOCK_WRTOSK(l) pl_wtos(l) /* W --> S */ +#define __RWLOCK_SKTORD(l) pl_stor(l) /* S --> R */ +#define __RWLOCK_WRTORD(l) pl_wtor(l) /* W --> R */ +#define __RWLOCK_SKUNLOCK(l) pl_drop_s(l) /* S --> N */ +#define __RWLOCK_TRYSKLOCK(l) (!pl_try_s(l)) /* N -?> S */ +#define __RWLOCK_TRYRDTOSK(l) (!pl_try_rtos(l)) /* R -?> S */ + +#define HA_SPIN_INIT(l) __spin_init(l) +#define HA_SPIN_DESTROY(l) __spin_destroy(l) + +#define HA_SPIN_LOCK(lbl, l) __spin_lock(lbl, l, __func__, __FILE__, __LINE__) +#define HA_SPIN_TRYLOCK(lbl, l) __spin_trylock(lbl, l, __func__, __FILE__, __LINE__) +#define HA_SPIN_UNLOCK(lbl, l) __spin_unlock(lbl, l, __func__, __FILE__, __LINE__) + +#define HA_RWLOCK_INIT(l) __ha_rwlock_init((l)) +#define HA_RWLOCK_DESTROY(l) __ha_rwlock_destroy((l)) +#define HA_RWLOCK_WRLOCK(lbl,l) __ha_rwlock_wrlock(lbl, l, __func__, __FILE__, __LINE__) +#define HA_RWLOCK_TRYWRLOCK(lbl,l) __ha_rwlock_trywrlock(lbl, l, __func__, __FILE__, __LINE__) +#define HA_RWLOCK_WRUNLOCK(lbl,l) __ha_rwlock_wrunlock(lbl, l, __func__, __FILE__, __LINE__) +#define HA_RWLOCK_RDLOCK(lbl,l) __ha_rwlock_rdlock(lbl, l) +#define HA_RWLOCK_TRYRDLOCK(lbl,l) __ha_rwlock_tryrdlock(lbl, l) +#define HA_RWLOCK_RDUNLOCK(lbl,l) __ha_rwlock_rdunlock(lbl, l) + +#define HA_RWLOCK_SKLOCK(lbl,l) __ha_rwlock_sklock(lbl, l, __func__, __FILE__, __LINE__) +#define HA_RWLOCK_SKTOWR(lbl,l) __ha_rwlock_sktowr(lbl, l, 
                                                          __func__, __FILE__, __LINE__)
+#define HA_RWLOCK_WRTOSK(lbl,l)    __ha_rwlock_wrtosk(lbl, l, __func__, __FILE__, __LINE__)
+#define HA_RWLOCK_SKTORD(lbl,l)    __ha_rwlock_sktord(lbl, l, __func__, __FILE__, __LINE__)
+#define HA_RWLOCK_WRTORD(lbl,l)    __ha_rwlock_wrtord(lbl, l, __func__, __FILE__, __LINE__)
+#define HA_RWLOCK_SKUNLOCK(lbl,l)  __ha_rwlock_skunlock(lbl, l, __func__, __FILE__, __LINE__)
+#define HA_RWLOCK_TRYSKLOCK(lbl,l) __ha_rwlock_trysklock(lbl, l, __func__, __FILE__, __LINE__)
+#define HA_RWLOCK_TRYRDTOSK(lbl,l) __ha_rwlock_tryrdtosk(lbl, l, __func__, __FILE__, __LINE__)
+
+/* WARNING!!! if you update this enum, please also keep lock_label() up to date
+ * below.
+ */
+enum lock_label {
+        TASK_RQ_LOCK,
+        TASK_WQ_LOCK,
+        LISTENER_LOCK,
+        PROXY_LOCK,
+        SERVER_LOCK,
+        LBPRM_LOCK,
+        SIGNALS_LOCK,
+        STK_TABLE_LOCK,
+        STK_SESS_LOCK,
+        APPLETS_LOCK,
+        PEER_LOCK,
+        SHCTX_LOCK,
+        SSL_LOCK,
+        SSL_GEN_CERTS_LOCK,
+        PATREF_LOCK,
+        PATEXP_LOCK,
+        VARS_LOCK,
+        COMP_POOL_LOCK,
+        LUA_LOCK,
+        NOTIF_LOCK,
+        SPOE_APPLET_LOCK,
+        DNS_LOCK,
+        PID_LIST_LOCK,
+        EMAIL_ALERTS_LOCK,
+        PIPES_LOCK,
+        TLSKEYS_REF_LOCK,
+        AUTH_LOCK,
+        RING_LOCK,
+        DICT_LOCK,
+        PROTO_LOCK,
+        QUEUE_LOCK,
+        CKCH_LOCK,
+        SNI_LOCK,
+        SSL_SERVER_LOCK,
+        SFT_LOCK, /* sink forward target */
+        IDLE_CONNS_LOCK,
+        OCSP_LOCK,
+        QC_CID_LOCK,
+        CACHE_LOCK,
+        OTHER_LOCK,
+        /* WT: make sure never to use these ones outside of development,
+         * we need them for lock profiling!
+         */
+        DEBUG1_LOCK,
+        DEBUG2_LOCK,
+        DEBUG3_LOCK,
+        DEBUG4_LOCK,
+        DEBUG5_LOCK,
+        LOCK_LABELS
+};
+
+
+/* Following functions are used to collect some stats about locks. We wrap
+ * pthread functions to know how much time we wait in a lock. */
+
+void show_lock_stats();
+void __ha_rwlock_init(struct ha_rwlock *l);
+void __ha_rwlock_destroy(struct ha_rwlock *l);
+void __ha_rwlock_wrlock(enum lock_label lbl, struct ha_rwlock *l,
+                        const char *func, const char *file, int line);
+int __ha_rwlock_trywrlock(enum lock_label lbl, struct ha_rwlock *l,
+                          const char *func, const char *file, int line);
+void __ha_rwlock_wrunlock(enum lock_label lbl,struct ha_rwlock *l,
+                          const char *func, const char *file, int line);
+void __ha_rwlock_rdlock(enum lock_label lbl,struct ha_rwlock *l);
+int __ha_rwlock_tryrdlock(enum lock_label lbl,struct ha_rwlock *l);
+void __ha_rwlock_rdunlock(enum lock_label lbl,struct ha_rwlock *l);
+void __ha_rwlock_wrtord(enum lock_label lbl, struct ha_rwlock *l,
+                        const char *func, const char *file, int line);
+void __ha_rwlock_wrtosk(enum lock_label lbl, struct ha_rwlock *l,
+                        const char *func, const char *file, int line);
+void __ha_rwlock_sklock(enum lock_label lbl, struct ha_rwlock *l,
+                        const char *func, const char *file, int line);
+void __ha_rwlock_sktowr(enum lock_label lbl, struct ha_rwlock *l,
+                        const char *func, const char *file, int line);
+void __ha_rwlock_sktord(enum lock_label lbl, struct ha_rwlock *l,
+                        const char *func, const char *file, int line);
+void __ha_rwlock_skunlock(enum lock_label lbl,struct ha_rwlock *l,
+                          const char *func, const char *file, int line);
+int __ha_rwlock_trysklock(enum lock_label lbl, struct ha_rwlock *l,
+                          const char *func, const char *file, int line);
+int __ha_rwlock_tryrdtosk(enum lock_label lbl, struct ha_rwlock *l,
+                          const char *func, const char *file, int line);
+void __spin_init(struct ha_spinlock *l);
+void __spin_destroy(struct ha_spinlock *l);
+void __spin_lock(enum lock_label lbl, struct ha_spinlock *l,
+                 const char *func, const char *file, int line);
+int __spin_trylock(enum lock_label lbl, struct ha_spinlock
*l, + const char *func, const char *file, int line); +void __spin_unlock(enum lock_label lbl, struct ha_spinlock *l, + const char *func, const char *file, int line); + +#endif /* DEBUG_THREAD */ + +#endif /* USE_THREAD */ + +#endif /* _HAPROXY_THREAD_H */ diff --git a/include/haproxy/ticks.h b/include/haproxy/ticks.h new file mode 100644 index 0000000..8b8fcc6 --- /dev/null +++ b/include/haproxy/ticks.h @@ -0,0 +1,157 @@ +/* + * include/haproxy/ticks.h + * Functions and macros for manipulation of expiration timers + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * Using a mix of milliseconds and timeval for internal timers is expensive and + * overkill, because we don't need such a precision to compute timeouts. + * So we're converting them to "ticks". + * + * A tick is a representation of a date relative to another one, and is + * measured in milliseconds. The natural usage is to represent an absolute date + * relative to the current date. Since it is not practical to update all values + * each time the current date changes, instead we use the absolute date rounded + * down to fit in a tick. We then have to compare a tick to the current date to + * know whether it is in the future or in the past. If a tick is below the + * current date, it is in the past. If it is above, it is in the future. The + * values will wrap so we can't compare that easily, instead we check the sign + * of the difference between a tick and the current date. + * + * Proceeding like this allows us to manipulate dates that are stored in + * scalars with enough precision and range. For this reason, we store ticks in + * 32-bit integers. This is enough to handle dates that are between 24.85 days + * in the past and as much in the future. + * + * We must both support absolute dates (well in fact, dates relative to now+/- + * 24 days), and intervals (for timeouts). Both types need an "eternity" magic + * value. For optimal code generation, we'll use zero as the magic value + * indicating that an expiration timer or a timeout is not set. We have to + * check that we don't return this value when adding timeouts to <now>. If a + * computation returns 0, we must increase it to 1 (which will push the timeout + * 1 ms further). For this reason, timeouts must not be added by hand but via + * the dedicated tick_add() function. + */ + +#ifndef _HAPROXY_TICKS_H +#define _HAPROXY_TICKS_H + +#include <haproxy/api.h> + +#define TICK_ETERNITY 0 + +/* right now, ticks are milliseconds. Both negative ms and negative ticks + * indicate eternity. 
+ */
+#define MS_TO_TICKS(ms) (ms)
+#define TICKS_TO_MS(tk) (tk)
+
+/* currently updated and stored in time.c */
+extern THREAD_LOCAL unsigned int now_ms; /* internal date in milliseconds (may wrap) */
+extern volatile unsigned int global_now_ms;
+
+/* return 1 if tick is set, otherwise 0 */
+static inline int tick_isset(int expire)
+{
+        return expire != 0;
+}
+
+/* Add <timeout> to <now>, and return the resulting expiration date.
+ * <timeout> will not be checked for null values.
+ */
+static inline int tick_add(int now, int timeout)
+{
+        now += timeout;
+        if (unlikely(!now))
+                now++;    /* unfortunate value */
+        return now;
+}
+
+/* add <timeout> to <now> if it is set, otherwise set it to eternity.
+ * Return the resulting expiration date.
+ */
+static inline int tick_add_ifset(int now, int timeout)
+{
+        if (!timeout)
+                return TICK_ETERNITY;
+        return tick_add(now, timeout);
+}
+
+/* return 1 if timer <t1> is before <t2>, neither of which can be infinite. */
+static inline int tick_is_lt(int t1, int t2)
+{
+        return (t1 - t2) < 0;
+}
+
+/* return 1 if timer <t1> is before or equal to <t2>, neither of which can be infinite. */
+static inline int tick_is_le(int t1, int t2)
+{
+        return (t1 - t2) <= 0;
+}
+
+/* return 1 if timer <timer> is expired at date <now>, otherwise zero */
+static inline int tick_is_expired(int timer, int now)
+{
+        if (unlikely(!tick_isset(timer)))
+                return 0;
+        if (unlikely((timer - now) <= 0))
+                return 1;
+        return 0;
+}
+
+/* return the first one of the two timers, both of which may be infinite */
+static inline int tick_first(int t1, int t2)
+{
+        if (!tick_isset(t1))
+                return t2;
+        if (!tick_isset(t2))
+                return t1;
+        if ((t1 - t2) <= 0)
+                return t1;
+        else
+                return t2;
+}
+
+/* return the first one of the two timers, where only the first one may be infinite */
+static inline int tick_first_2nz(int t1, int t2)
+{
+        if (!tick_isset(t1))
+                return t2;
+        if ((t1 - t2) <= 0)
+                return t1;
+        else
+                return t2;
+}
+
+/* return the number of ticks remaining from <now> to <exp>, or zero if expired */
+static inline int tick_remain(int now, int exp)
+{
+        if (tick_is_expired(exp, now))
+                return 0;
+        return exp - now;
+}
+
+#endif /* _HAPROXY_TICKS_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/time.h b/include/haproxy/time.h
new file mode 100644
index 0000000..3ebc683
--- /dev/null
+++ b/include/haproxy/time.h
@@ -0,0 +1,520 @@
+/*
+ * include/haproxy/time.h
+ * timeval-based time calculation functions and macros.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_TIME_H
+#define _HAPROXY_TIME_H
+
+#include <sys/time.h>
+#include <haproxy/api.h>
+
+#define TIME_ETERNITY   (TV_ETERNITY_MS)
+
+
+
+/**** exported functions *************************************************/
+/*
+ * adds <ms> ms to <from>, set the result to <tv> and returns a pointer <tv>
+ */
+struct timeval *tv_ms_add(struct timeval *tv, const struct timeval *from, int ms);
+
+/*
+ * compares <tv1> and <tv2> modulo 1ms: returns 0 if equal, -1 if tv1 < tv2, 1 if tv1 > tv2
+ * Must not be used when either argument is eternity. Use tv_ms_cmp2() for that.
+ */
+int tv_ms_cmp(const struct timeval *tv1, const struct timeval *tv2);
+
+/*
+ * compares <tv1> and <tv2> modulo 1 ms: returns 0 if equal, -1 if tv1 < tv2, 1 if tv1 > tv2,
+ * assuming that TV_ETERNITY is greater than everything.
+ */
+int tv_ms_cmp2(const struct timeval *tv1, const struct timeval *tv2);
+
+
+/**** general purpose functions and macros *******************************/
+
+
+/*
+ * sets a struct timeval to its highest value so that it can never happen
+ * note that only tv_usec is necessary to detect it since a tv_usec > 999999
+ * is normally not possible.
+ */
+static inline struct timeval *tv_eternity(struct timeval *tv)
+{
+        tv->tv_sec  = (typeof(tv->tv_sec))TV_ETERNITY;
+        tv->tv_usec = (typeof(tv->tv_usec))TV_ETERNITY;
+        return tv;
+}
+
+/*
+ * sets a struct timeval to 0
+ *
+ */
+static inline struct timeval *tv_zero(struct timeval *tv) {
+        tv->tv_sec = tv->tv_usec = 0;
+        return tv;
+}
+
+/*
+ * returns non null if tv is [eternity], otherwise 0.
+ */
+#define tv_iseternity(tv)       ((tv)->tv_usec == (typeof((tv)->tv_usec))TV_ETERNITY)
+
+/*
+ * returns 0 if tv is [eternity], otherwise non-zero.
+ */
+#define tv_isset(tv)       ((tv)->tv_usec != (typeof((tv)->tv_usec))TV_ETERNITY)
+
+/*
+ * returns non null if tv is [0], otherwise 0.
+ */
+#define tv_iszero(tv)           (((tv)->tv_sec | (tv)->tv_usec) == 0)
+
+/*
+ * Converts a struct timeval to a wrapping number of milliseconds.
+ */
+static inline uint __tv_to_ms(const struct timeval *tv)
+{
+        unsigned int ret;
+
+        ret  = (uint)tv->tv_sec * 1000;
+        ret += (uint)tv->tv_usec / 1000;
+        return ret;
+}
+
+/*
+ * Converts a number of milliseconds to a struct timeval.
+ */
+static inline struct timeval * __tv_from_ms(struct timeval *tv, unsigned long ms)
+{
+        tv->tv_sec = ms / 1000;
+        tv->tv_usec = (ms % 1000) * 1000;
+        return tv;
+}
+
+/*
+ * Converts a struct timeval to a relative timestamp in nanoseconds (only
+ * wraps every 585 years, i.e. never for our purpose).
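+ * (2^64 ns is about 1.8e19 ns, i.e. about 1.8e10 seconds, or ~584.9 years)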
+ */ +static forceinline ullong tv_to_ns(const struct timeval *tv) +{ + ullong ret; + + ret = (ullong)tv->tv_sec * 1000000000ULL; + ret += (ullong)tv->tv_usec * 1000ULL; + return ret; +} + +/* turns nanoseconds to seconds, just to avoid typos */ +static forceinline uint ns_to_sec(ullong ns) +{ + return ns / 1000000000ULL; +} + +/* turns nanoseconds to milliseconds, just to avoid typos */ +static forceinline uint ns_to_ms(ullong ns) +{ + return ns / 1000000ULL; +} + +/* turns seconds to nanoseconds, just to avoid typos */ +static forceinline ullong sec_to_ns(uint sec) +{ + return sec * 1000000000ULL; +} + +/* turns milliseconds to nanoseconds, just to avoid typos */ +static forceinline ullong ms_to_ns(uint ms) +{ + return ms * 1000000ULL; +} + +/* turns microseconds to nanoseconds, just to avoid typos */ +static forceinline ullong us_to_ns(uint us) +{ + return us * 1000ULL; +} + +/* creates a struct timeval from a relative timestamp in nanosecond */ +#define NS_TO_TV(t) ((const struct timeval){ .tv_sec = (t) / 1000000000ULL, .tv_usec = ((t) % 1000000000ULL) / 1000U }) + +/* Return a number of 1024Hz ticks between 0 and 1023 for input number of + * usecs between 0 and 999999. This function has been optimized to remove + * any divide and multiply, as it is completely optimized away by the compiler + * on CPUs which don't have a fast multiply. Its avg error rate is 305 ppm, + * which is almost twice as low as a direct usec to ms conversion. This version + * also has the benefit of returning 1024 for 1000000. + */ +static inline unsigned int __usec_to_1024th(unsigned int usec) +{ + return (usec * 1073 + 742516) >> 20; +} + + +/**** comparison functions and macros ***********************************/ + + +/* tv_cmp: compares <tv1> and <tv2> : returns 0 if equal, -1 if tv1 < tv2, 1 if tv1 > tv2. */ +static inline int __tv_cmp(const struct timeval *tv1, const struct timeval *tv2) +{ + if ((unsigned)tv1->tv_sec < (unsigned)tv2->tv_sec) + return -1; + else if ((unsigned)tv1->tv_sec > (unsigned)tv2->tv_sec) + return 1; + else if ((unsigned)tv1->tv_usec < (unsigned)tv2->tv_usec) + return -1; + else if ((unsigned)tv1->tv_usec > (unsigned)tv2->tv_usec) + return 1; + else + return 0; +} + +/* tv_iseq: compares <tv1> and <tv2> : returns 1 if tv1 == tv2, otherwise 0 */ +#define tv_iseq __tv_iseq +static inline int __tv_iseq(const struct timeval *tv1, const struct timeval *tv2) +{ + return ((unsigned)tv1->tv_sec == (unsigned)tv2->tv_sec) && + ((unsigned)tv1->tv_usec == (unsigned)tv2->tv_usec); +} + +/* tv_isgt: compares <tv1> and <tv2> : returns 1 if tv1 > tv2, otherwise 0 */ +#define tv_isgt _tv_isgt +int _tv_isgt(const struct timeval *tv1, const struct timeval *tv2); +static inline int __tv_isgt(const struct timeval *tv1, const struct timeval *tv2) +{ + return + ((unsigned)tv1->tv_sec == (unsigned)tv2->tv_sec) ? + ((unsigned)tv1->tv_usec > (unsigned)tv2->tv_usec) : + ((unsigned)tv1->tv_sec > (unsigned)tv2->tv_sec); +} + +/* tv_isge: compares <tv1> and <tv2> : returns 1 if tv1 >= tv2, otherwise 0 */ +#define tv_isge __tv_isge +static inline int __tv_isge(const struct timeval *tv1, const struct timeval *tv2) +{ + return + ((unsigned)tv1->tv_sec == (unsigned)tv2->tv_sec) ? 
+ ((unsigned)tv1->tv_usec >= (unsigned)tv2->tv_usec) : + ((unsigned)tv1->tv_sec > (unsigned)tv2->tv_sec); +} + +/* tv_islt: compares <tv1> and <tv2> : returns 1 if tv1 < tv2, otherwise 0 */ +#define tv_islt __tv_islt +static inline int __tv_islt(const struct timeval *tv1, const struct timeval *tv2) +{ + return + ((unsigned)tv1->tv_sec == (unsigned)tv2->tv_sec) ? + ((unsigned)tv1->tv_usec < (unsigned)tv2->tv_usec) : + ((unsigned)tv1->tv_sec < (unsigned)tv2->tv_sec); +} + +/* tv_isle: compares <tv1> and <tv2> : returns 1 if tv1 <= tv2, otherwise 0 */ +#define tv_isle _tv_isle +int _tv_isle(const struct timeval *tv1, const struct timeval *tv2); +static inline int __tv_isle(const struct timeval *tv1, const struct timeval *tv2) +{ + return + ((unsigned)tv1->tv_sec == (unsigned)tv2->tv_sec) ? + ((unsigned)tv1->tv_usec <= (unsigned)tv2->tv_usec) : + ((unsigned)tv1->tv_sec < (unsigned)tv2->tv_sec); +} + +/* + * compares <tv1> and <tv2> modulo 1ms: returns 0 if equal, -1 if tv1 < tv2, 1 if tv1 > tv2 + * Must not be used when either argument is eternity. Use tv_ms_cmp2() for that. + */ +#define tv_ms_cmp _tv_ms_cmp +int _tv_ms_cmp(const struct timeval *tv1, const struct timeval *tv2); +static inline int __tv_ms_cmp(const struct timeval *tv1, const struct timeval *tv2) +{ + if ((unsigned)tv1->tv_sec == (unsigned)tv2->tv_sec) { + if ((unsigned)tv2->tv_usec >= (unsigned)tv1->tv_usec + 1000) + return -1; + else if ((unsigned)tv1->tv_usec >= (unsigned)tv2->tv_usec + 1000) + return 1; + else + return 0; + } + else if (((unsigned)tv2->tv_sec > (unsigned)tv1->tv_sec + 1) || + (((unsigned)tv2->tv_sec == (unsigned)tv1->tv_sec + 1) && + ((unsigned)tv2->tv_usec + 1000000 >= (unsigned)tv1->tv_usec + 1000))) + return -1; + else if (((unsigned)tv1->tv_sec > (unsigned)tv2->tv_sec + 1) || + (((unsigned)tv1->tv_sec == (unsigned)tv2->tv_sec + 1) && + ((unsigned)tv1->tv_usec + 1000000 >= (unsigned)tv2->tv_usec + 1000))) + return 1; + else + return 0; +} + +/* + * compares <tv1> and <tv2> modulo 1 ms: returns 0 if equal, -1 if tv1 < tv2, 1 if tv1 > tv2, + * assuming that TV_ETERNITY is greater than everything. + */ +#define tv_ms_cmp2 _tv_ms_cmp2 +int _tv_ms_cmp2(const struct timeval *tv1, const struct timeval *tv2); +static inline int __tv_ms_cmp2(const struct timeval *tv1, const struct timeval *tv2) +{ + if (tv_iseternity(tv1)) + if (tv_iseternity(tv2)) + return 0; /* same */ + else + return 1; /* tv1 later than tv2 */ + else if (tv_iseternity(tv2)) + return -1; /* tv2 later than tv1 */ + return tv_ms_cmp(tv1, tv2); +} + +/* + * compares <tv1> and <tv2> modulo 1 ms: returns 1 if tv1 <= tv2, 0 if tv1 > tv2, + * assuming that TV_ETERNITY is greater than everything. Returns 0 if tv1 is + * TV_ETERNITY, and always assumes that tv2 != TV_ETERNITY. Designed to replace + * occurrences of (tv_ms_cmp2(tv,now) <= 0). 
+ */ +#define tv_ms_le2 _tv_ms_le2 +int _tv_ms_le2(const struct timeval *tv1, const struct timeval *tv2); +static inline int __tv_ms_le2(const struct timeval *tv1, const struct timeval *tv2) +{ + if (likely((unsigned)tv1->tv_sec > (unsigned)tv2->tv_sec + 1)) + return 0; + + if (likely((unsigned)tv1->tv_sec < (unsigned)tv2->tv_sec)) + return 1; + + if (likely((unsigned)tv1->tv_sec == (unsigned)tv2->tv_sec)) { + if ((unsigned)tv2->tv_usec >= (unsigned)tv1->tv_usec + 1000) + return 1; + else + return 0; + } + + if (unlikely(((unsigned)tv1->tv_sec == (unsigned)tv2->tv_sec + 1) && + ((unsigned)tv1->tv_usec + 1000000 >= (unsigned)tv2->tv_usec + 1000))) + return 0; + else + return 1; +} + + +/**** operators **********************************************************/ + + +/* + * Returns the time in ms elapsed between tv1 and tv2, assuming that tv1<=tv2. + * Must not be used when either argument is eternity. + */ +#define tv_ms_elapsed __tv_ms_elapsed +unsigned long _tv_ms_elapsed(const struct timeval *tv1, const struct timeval *tv2); +static inline unsigned long __tv_ms_elapsed(const struct timeval *tv1, const struct timeval *tv2) +{ + unsigned long ret; + + ret = ((signed long)(tv2->tv_sec - tv1->tv_sec)) * 1000; + ret += ((signed long)(tv2->tv_usec - tv1->tv_usec)) / 1000; + return ret; +} + +/* + * returns the remaining time between tv1=now and event=tv2 + * if tv2 is passed, 0 is returned. + * Must not be used when either argument is eternity. + */ + +#define tv_ms_remain __tv_ms_remain +unsigned long _tv_ms_remain(const struct timeval *tv1, const struct timeval *tv2); +static inline unsigned long __tv_ms_remain(const struct timeval *tv1, const struct timeval *tv2) +{ + if (tv_ms_cmp(tv1, tv2) >= 0) + return 0; /* event elapsed */ + + return __tv_ms_elapsed(tv1, tv2); +} + +/* + * returns the remaining time between tv1=now and event=tv2 + * if tv2 is passed, 0 is returned. + * Returns TIME_ETERNITY if tv2 is eternity. + */ +#define tv_ms_remain2 _tv_ms_remain2 +unsigned long _tv_ms_remain2(const struct timeval *tv1, const struct timeval *tv2); +static inline unsigned long __tv_ms_remain2(const struct timeval *tv1, const struct timeval *tv2) +{ + if (tv_iseternity(tv2)) + return TIME_ETERNITY; + + return tv_ms_remain(tv1, tv2); +} + +/* + * adds <inc> to <from>, set the result to <tv> and returns a pointer <tv> + */ +#define tv_add _tv_add +struct timeval *_tv_add(struct timeval *tv, const struct timeval *from, const struct timeval *inc); +static inline struct timeval *__tv_add(struct timeval *tv, const struct timeval *from, const struct timeval *inc) +{ + tv->tv_usec = from->tv_usec + inc->tv_usec; + tv->tv_sec = from->tv_sec + inc->tv_sec; + if (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } + return tv; +} + + +/* + * If <inc> is set, then add it to <from> and set the result to <tv>, then + * return 1, otherwise return 0. It is meant to be used in if conditions. 
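+ * For example (an illustrative sketch; <exp>, <now> and <timeout> are
+ * hypothetical variables):
+ *     if (!tv_add_ifset(&exp, &now, &timeout))
+ *             tv_eternity(&exp);
+ * leaves <exp> set to eternity when <timeout> is not set.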
+ */ +#define tv_add_ifset _tv_add_ifset +int _tv_add_ifset(struct timeval *tv, const struct timeval *from, const struct timeval *inc); +static inline int __tv_add_ifset(struct timeval *tv, const struct timeval *from, const struct timeval *inc) +{ + if (tv_iseternity(inc)) + return 0; + tv->tv_usec = from->tv_usec + inc->tv_usec; + tv->tv_sec = from->tv_sec + inc->tv_sec; + if (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } + return 1; +} + +/* + * adds <inc> to <tv> and returns a pointer <tv> + */ +static inline struct timeval *__tv_add2(struct timeval *tv, const struct timeval *inc) +{ + tv->tv_usec += inc->tv_usec; + tv->tv_sec += inc->tv_sec; + if (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } + return tv; +} + + +/* + * Computes the remaining time between tv1=now and event=tv2. if tv2 is passed, + * 0 is returned. The result is stored into tv. + */ +#define tv_remain _tv_remain +struct timeval *_tv_remain(const struct timeval *tv1, const struct timeval *tv2, struct timeval *tv); +static inline struct timeval *__tv_remain(const struct timeval *tv1, const struct timeval *tv2, struct timeval *tv) +{ + tv->tv_usec = tv2->tv_usec - tv1->tv_usec; + tv->tv_sec = tv2->tv_sec - tv1->tv_sec; + if ((signed)tv->tv_sec > 0) { + if ((signed)tv->tv_usec < 0) { + tv->tv_usec += 1000000; + tv->tv_sec--; + } + } else if (tv->tv_sec == 0) { + if ((signed)tv->tv_usec < 0) + tv->tv_usec = 0; + } else { + tv->tv_sec = 0; + tv->tv_usec = 0; + } + return tv; +} + + +/* + * Computes the remaining time between tv1=now and event=tv2. if tv2 is passed, + * 0 is returned. The result is stored into tv. Returns ETERNITY if tv2 is + * eternity. + */ +#define tv_remain2 _tv_remain2 +struct timeval *_tv_remain2(const struct timeval *tv1, const struct timeval *tv2, struct timeval *tv); +static inline struct timeval *__tv_remain2(const struct timeval *tv1, const struct timeval *tv2, struct timeval *tv) +{ + if (tv_iseternity(tv2)) + return tv_eternity(tv); + return __tv_remain(tv1, tv2, tv); +} + + +/* + * adds <ms> ms to <from>, set the result to <tv> and returns a pointer <tv> + */ +#define tv_ms_add _tv_ms_add +struct timeval *_tv_ms_add(struct timeval *tv, const struct timeval *from, int ms); +static inline struct timeval *__tv_ms_add(struct timeval *tv, const struct timeval *from, int ms) +{ + tv->tv_usec = from->tv_usec + (ms % 1000) * 1000; + tv->tv_sec = from->tv_sec + (ms / 1000); + while (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } + return tv; +} + + +/* + * compares <tv1> and <tv2> : returns 1 if <tv1> is before <tv2>, otherwise 0. + * This should be very fast because it's used in schedulers. + * It has been optimized to return 1 (so call it in a loop which continues + * as long as tv1<=tv2) + */ + +#define tv_isbefore(tv1, tv2) \ + (unlikely((unsigned)(tv1)->tv_sec < (unsigned)(tv2)->tv_sec) ? 1 : \ + (unlikely((unsigned)(tv1)->tv_sec > (unsigned)(tv2)->tv_sec) ? 0 : \ + unlikely((unsigned)(tv1)->tv_usec < (unsigned)(tv2)->tv_usec))) + +/* + * returns the first event between <tv1> and <tv2> into <tvmin>. + * a zero tv is ignored. <tvmin> is returned. If <tvmin> is known + * to be the same as <tv1> or <tv2>, it is recommended to use + * tv_bound instead. + */ +#define tv_min(tvmin, tv1, tv2) ({ \ + if (tv_isbefore(tv1, tv2)) { \ + *tvmin = *tv1; \ + } \ + else { \ + *tvmin = *tv2; \ + } \ + tvmin; \ +}) + +/* + * returns the first event between <tv1> and <tv2> into <tvmin>. + * a zero tv is ignored. <tvmin> is returned. 
This function has been
+ * optimized to be called as tv_min(a,a,b) or tv_min(b,a,b).
+ */
+#define tv_bound(tv1, tv2) ({      \
+        if (tv_isbefore(tv2, tv1)) \
+                *tv1 = *tv2;       \
+        tv1;                       \
+})
+
+#endif /* _HAPROXY_TIME_H */
+
+/*
+ * Local variables:
+ *  c-indent-level: 8
+ *  c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/timeshift.h b/include/haproxy/timeshift.h
new file mode 100644
index 0000000..62e5855
--- /dev/null
+++ b/include/haproxy/timeshift.h
@@ -0,0 +1,10 @@
+#include <sys/time.h>
+#include <sys/epoll.h>
+
+#define gettimeofday(tv, tz)  timeshift_gettimeofday(tv, tz)
+#define clock_gettime(clk_id, tp) timeshift_clock_gettime(clk_id, tp)
+#define epoll_wait(epfd, events, maxevents, timeout) timeshift_epoll_wait(epfd, events, maxevents, timeout)
+
+int timeshift_gettimeofday(struct timeval *tv, void *tz);
+int timeshift_clock_gettime(clockid_t clk_id, struct timespec *tp);
+int timeshift_epoll_wait(int epfd, struct epoll_event *events, int maxevents, int timeout);
diff --git a/include/haproxy/tinfo-t.h b/include/haproxy/tinfo-t.h
new file mode 100644
index 0000000..357c4c0
--- /dev/null
+++ b/include/haproxy/tinfo-t.h
@@ -0,0 +1,180 @@
+/*
+ * include/haproxy/tinfo-t.h
+ * Definitions of the thread_info structure.
+ *
+ * Copyright (C) 2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_TINFO_T_H
+#define _HAPROXY_TINFO_T_H
+
+#include <import/ebtree-t.h>
+
+#include <haproxy/api-t.h>
+#include <haproxy/freq_ctr-t.h>
+#include <haproxy/thread-t.h>
+
+/* forward declarations for types used below */
+struct buffer;
+
+/* Threads sets are known either by a set of absolute thread numbers, or by a
+ * set of relative thread numbers within a group, for each group. The default
+ * is the absolute mode and corresponds to the case where no group is known
+ * (grps == 0). The mode may only be changed when the set is empty (use
+ * thread_set_is_empty() for this).
+ */
+struct thread_set {
+        union {
+                ulong abs[(MAX_THREADS + LONGBITS - 1) / LONGBITS];
+                ulong rel[MAX_TGROUPS];
+        };
+        ulong grps; /* bit field of all non-empty groups, 0 for abs */
+};
+
+/* tasklet classes */
+enum {
+        TL_URGENT = 0,   /* urgent tasklets (I/O callbacks) */
+        TL_NORMAL = 1,   /* normal tasks */
+        TL_BULK   = 2,   /* bulk task/tasklets, streaming I/Os */
+        TL_HEAVY  = 3,   /* heavy computational tasklets (e.g. TLS handshakes) */
+        TL_CLASSES       /* must be last */
+};
+
+/* thread_ctx flags, for ha_thread_ctx[].flags. These flags describe the
+ * thread's state and are visible to other threads, so they must be used
+ * with atomic ops.
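+ * For example (illustrative, using the atomic helpers seen in thread.h): a
+ * thread may set one of its own flags with
+ *     HA_ATOMIC_OR(&th_ctx->flags, TH_FL_SLEEPING);
+ * while another thread reads them with HA_ATOMIC_LOAD(&ha_thread_ctx[t].flags).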
+ */ +#define TH_FL_STUCK 0x00000001 +#define TH_FL_TASK_PROFILING 0x00000002 +#define TH_FL_NOTIFIED 0x00000004 /* task was notified about the need to wake up */ +#define TH_FL_SLEEPING 0x00000008 /* thread won't check its task list before next wakeup */ +#define TH_FL_STARTED 0x00000010 /* set once the thread starts */ +#define TH_FL_IN_LOOP 0x00000020 /* set only inside the polling loop */ + + +/* Thread group information. This defines a base and a count of global thread + * IDs which belong to it, and which can be looked up into thread_info/ctx. It + * is set up during parsing and is stable during operation. Thread groups start + * at 1 so tgroup[0] describes thread group 1. + */ +struct tgroup_info { + ulong threads_enabled; /* mask of threads enabled in this group */ + uint base; /* first thread in this group */ + uint count; /* number of threads in this group */ + ulong tgid_bit; /* bit corresponding to the tgroup ID */ + + /* pad to cache line (64B) */ + char __pad[0]; /* unused except to check remaining room */ + char __end[0] __attribute__((aligned(64))); +}; + +/* This structure describes the group-specific context (e.g. active threads + * etc). It uses one cache line per thread to limit false sharing. + */ +struct tgroup_ctx { + ulong threads_harmless; /* mask of threads that are not modifying anything */ + ulong threads_idle; /* mask of threads idling in the poller */ + ulong stopping_threads; /* mask of threads currently stopping */ + + struct eb_root timers; /* wait queue (sorted timers tree, global, accessed under wq_lock) */ + + uint niced_tasks; /* number of niced tasks in this group's run queues */ + + /* pad to cache line (64B) */ + char __pad[0]; /* unused except to check remaining room */ + char __end[0] __attribute__((aligned(64))); +}; + +/* This structure describes all the per-thread info we need. When threads are + * disabled, it contains the same info for the single running thread. This is + * stable across all of a thread's life, and is being pointed to by the + * thread-local "ti" pointer. + */ +struct thread_info { + const struct tgroup_info *tg; /* config of the thread-group this thread belongs to */ + struct tgroup_ctx *tg_ctx; /* context of the thread-group this thread belongs to */ + uint tid, ltid; /* process-wide and group-wide thread ID (start at 0) */ + ulong ltid_bit; /* bit masks for the tid/ltid */ + uint tgid; /* ID of the thread group this thread belongs to (starts at 1; 0=unset) */ + /* 32-bit hole here */ + + ullong pth_id; /* the pthread_t cast to a ullong */ + void *stack_top; /* the top of the stack when entering the thread */ + + /* pad to cache line (64B) */ + char __pad[0]; /* unused except to check remaining room */ + char __end[0] __attribute__((aligned(64))); +}; + +/* This structure describes all the per-thread context we need. This is + * essentially the scheduler-specific stuff and a few important per-thread + * lists that need to be thread-local. We take care of splitting this into + * separate cache lines. + */ +struct thread_ctx { + // first and second cache lines on 64 bits: thread-local operations only. 
+        struct eb_root timers;              /* tree constituting the per-thread wait queue */
+        struct eb_root rqueue;              /* tree constituting the per-thread run queue */
+        struct task *current;               /* current task (not tasklet) */
+        int current_queue;                  /* points to current tasklet list being run, -1 if none */
+        unsigned int nb_tasks;              /* number of tasks allocated on this thread */
+        uint8_t tl_class_mask;              /* bit mask of non-empty tasklets classes */
+
+        // 7 bytes hole here
+        struct list pool_lru_head;          /* oldest objects in thread-local pool caches */
+        struct list buffer_wq;              /* buffer waiters */
+        struct list streams;                /* list of streams attached to this thread */
+        struct list quic_conns;             /* list of active quic-conns attached to this thread */
+        struct list quic_conns_clo;         /* list of closing quic-conns attached to this thread */
+        struct list queued_checks;          /* checks waiting for a connection slot */
+        unsigned int nb_rhttp_conns;        /* count of current conns used for active reverse HTTP */
+
+        ALWAYS_ALIGN(2*sizeof(void*));
+        struct list tasklets[TL_CLASSES];   /* tasklets (and/or tasks) to run, by class */
+
+        // third cache line here on 64 bits: accessed mostly using atomic ops
+        ALWAYS_ALIGN(64);
+        struct mt_list shared_tasklet_list; /* tasklets to be run, woken up by other threads */
+        unsigned int rqueue_ticks;          /* Insertion counter for the run queue */
+        unsigned int rq_total;              /* total size of the run queue, prio_tree + tasklets */
+        int tasks_in_list;                  /* Number of tasks in the per-thread tasklets list */
+        uint idle_pct;                      /* idle to total ratio over last sample (percent) */
+        uint flags;                         /* thread flags, TH_FL_*, atomic! */
+        uint active_checks;                 /* number of active health checks on this thread, incl migrated */
+
+        uint32_t sched_wake_date;           /* current task/tasklet's wake date or 0 */
+        uint32_t sched_call_date;           /* current task/tasklet's call date (valid if sched_wake_date > 0) */
+        struct sched_activity *sched_profile_entry; /* profile entry in use by the current task/tasklet, only if sched_wake_date>0 */
+
+        uint64_t prev_cpu_time;             /* previous per thread CPU time */
+        uint64_t prev_mono_time;            /* previous system wide monotonic time */
+
+        struct eb_root rqueue_shared;       /* run queue fed by other threads */
+        __decl_thread(HA_SPINLOCK_T rqsh_lock); /* lock protecting the shared runqueue */
+
+        struct freq_ctr out_32bps;          /* #of 32-byte blocks emitted per second */
+        uint running_checks;                /* number of health checks currently running on this thread */
+
+        unsigned long long out_bytes;       /* total #of bytes emitted */
+        unsigned long long spliced_out_bytes; /* total #of bytes emitted through a kernel pipe */
+        struct buffer *thread_dump_buffer;  /* NULL out of dump, valid during a dump, 0x01 once done */
+
+        ALWAYS_ALIGN(128);
+};
+
+
+#endif /* _HAPROXY_TINFO_T_H */
diff --git a/include/haproxy/tinfo.h b/include/haproxy/tinfo.h
new file mode 100644
index 0000000..ddb26aa
--- /dev/null
+++ b/include/haproxy/tinfo.h
@@ -0,0 +1,120 @@
+/*
+ * include/haproxy/tinfo.h
+ * Export of ha_thread_info[] and ti pointer.
+ *
+ * Copyright (C) 2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_TINFO_H
+#define _HAPROXY_TINFO_H
+
+#include <haproxy/api.h>
+#include <haproxy/tinfo-t.h>
+#include <haproxy/intops.h>
+
+/* the structs are in thread.c */
+extern struct tgroup_info ha_tgroup_info[MAX_TGROUPS];
+extern THREAD_LOCAL const struct tgroup_info *tg;
+
+extern struct thread_info ha_thread_info[MAX_THREADS];
+extern THREAD_LOCAL const struct thread_info *ti; /* thread_info for the current thread */
+
+extern struct tgroup_ctx ha_tgroup_ctx[MAX_TGROUPS];
+extern THREAD_LOCAL struct tgroup_ctx *tg_ctx; /* ha_tgroup_ctx for the current thread */
+
+extern struct thread_ctx ha_thread_ctx[MAX_THREADS];
+extern THREAD_LOCAL struct thread_ctx *th_ctx; /* ha_thread_ctx for the current thread */
+
+/* returns the number of threads set in set <ts>. */
+static inline int thread_set_count(const struct thread_set *ts)
+{
+        int i, n;
+
+        /* iterating over tgroups guarantees to visit all possible threads, the
+         * opposite is not true.
+         */
+        for (i = n = 0; i < MAX_TGROUPS; i++)
+                n += my_popcountl(ts->rel[i]);
+        return n;
+}
+
+/* returns zero if at least one thread is set in thread set <ts>, otherwise
+ * non-zero.
+ */
+static inline int thread_set_is_empty(const struct thread_set *ts)
+{
+        int i;
+
+        /* iterating over tgroups guarantees to visit all possible threads, the
+         * opposite is not true.
+         */
+        for (i = 0; i < MAX_TGROUPS; i++)
+                if (ts->rel[i])
+                        return 0;
+        return 1;
+}
+
+/* returns the number starting at 1 of the <n>th thread-group set in thread set
+ * <ts>, or zero if the set is empty or if thread numbers are only absolute.
+ * <n> starts at zero and corresponds to the number of non-empty groups to be
+ * skipped (i.e. 0 returns the first one).
+ */
+static inline int thread_set_nth_group(const struct thread_set *ts, int n)
+{
+        int i;
+
+        if (ts->grps) {
+                for (i = 0; i < MAX_TGROUPS; i++)
+                        if (ts->rel[i] && !n--)
+                                return i + 1;
+        }
+        return 0;
+}
+
+/* returns the thread mask of the <n>th assigned thread-group in the thread
+ * set <ts> for relative sets, the first thread mask at all in case of absolute
+ * sets, or zero if the set is empty. This is only used temporarily to ease the
+ * transition. <n> starts at zero and corresponds to the number of non-empty
+ * groups to be skipped (i.e. 0 returns the first one).
+ */
+static inline ulong thread_set_nth_tmask(const struct thread_set *ts, int n)
+{
+        int i;
+
+        if (ts->grps) {
+                for (i = 0; i < MAX_TGROUPS; i++)
+                        if (ts->rel[i] && !n--)
+                                return ts->rel[i];
+        }
+        return ts->abs[0];
+}
+
+/* Pins the thread set to the specified thread mask on group 1 (use ~0UL for
+ * all threads). This is for compatibility with some rare legacy code. If a
+ * "thread" directive on a bind line is parsed, this one will be overwritten.
+ */
+static inline void thread_set_pin_grp1(struct thread_set *ts, ulong mask)
+{
+        int i;
+
+        ts->grps = 1;
+        ts->rel[0] = mask;
+        for (i = 1; i < MAX_TGROUPS; i++)
+                ts->rel[i] = 0;
+}
+
+#endif /* _HAPROXY_TINFO_H */
diff --git a/include/haproxy/tools-t.h b/include/haproxy/tools-t.h
new file mode 100644
index 0000000..32d8193
--- /dev/null
+++ b/include/haproxy/tools-t.h
@@ -0,0 +1,166 @@
+/*
+ * include/haproxy/tools-t.h
+ * This file contains some general purpose macros and structures.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_TOOLS_T_H
+#define _HAPROXY_TOOLS_T_H
+
+/* size used for max length of decimal representation of long long int. */
+#define NB_LLMAX_STR (sizeof("-9223372036854775807")-1)
+
+/* number of itoa_str entries */
+#define NB_ITOA_STR 16
+
+/* maximum quoted string length (truncated above) */
+#define QSTR_SIZE 200
+#define NB_QSTR 10
+
+/* returns 1 if at most one bit is set in X (i.e. X is zero or a power of 2),
+ * otherwise 0 */
+#define POWEROF2(x) (((x) & ((x)-1)) == 0)
+
+/* return an integer of type <ret> with only the highest bit set. <ret> may be
+ * either a variable or a type.
+ */
+#define MID_RANGE(ret) ((typeof(ret))1 << (8*sizeof(ret) - 1))
+
+/* return the largest possible integer of type <ret>, with all bits set */
+#define MAX_RANGE(ret) (~(typeof(ret))0)
+
+/* DEFNULL() returns either the argument as-is, or NULL if absent. This is for
+ * use in macro arguments.
+ */
+#define DEFNULL(...)  _FIRST_ARG(NULL, ##__VA_ARGS__, NULL)
+#define _FIRST_ARG(a, b, ...)
b + +/* options flags for parse_line() */ +#define PARSE_OPT_SHARP 0x00000001 // '#' ends the line +#define PARSE_OPT_BKSLASH 0x00000002 // '\' escapes chars +#define PARSE_OPT_SQUOTE 0x00000004 // "'" encloses a string +#define PARSE_OPT_DQUOTE 0x00000008 // '"' encloses a string +#define PARSE_OPT_ENV 0x00000010 // '$' is followed by environment variables +#define PARSE_OPT_INPLACE 0x00000020 // parse and tokenize in-place (src == dst) +#define PARSE_OPT_WORD_EXPAND 0x00000040 // '[*]' suffix to expand an environment variable as several individual arguments + +/* return error flags from parse_line() */ +#define PARSE_ERR_TOOLARGE 0x00000001 // result is too large for initial outlen +#define PARSE_ERR_TOOMANY 0x00000002 // more words than initial nbargs +#define PARSE_ERR_QUOTE 0x00000004 // unmatched quote (offending one at errptr) +#define PARSE_ERR_BRACE 0x00000008 // unmatched brace (offending one at errptr) +#define PARSE_ERR_HEX 0x00000010 // unparsable hex sequence (at errptr) +#define PARSE_ERR_VARNAME 0x00000020 // invalid variable name (at errptr) +#define PARSE_ERR_OVERLAP 0x00000040 // output overlaps with input, need to allocate +#define PARSE_ERR_WRONG_EXPAND 0x00000080 // unparsable word expansion sequence + +/* special return values for the time parser (parse_time_err()) */ +#define PARSE_TIME_UNDER ((char *)1) +#define PARSE_TIME_OVER ((char *)2) + +/* unit flags to pass to parse_time_err() */ +#define TIME_UNIT_US 0x0000 +#define TIME_UNIT_MS 0x0001 +#define TIME_UNIT_S 0x0002 +#define TIME_UNIT_MIN 0x0003 +#define TIME_UNIT_HOUR 0x0004 +#define TIME_UNIT_DAY 0x0005 +#define TIME_UNIT_MASK 0x0007 + +#define SEC 1 +#define MINUTE (60 * SEC) +#define HOUR (60 * MINUTE) +#define DAY (24 * HOUR) + +/* Address parsing options for use with str2sa_range() */ +#define PA_O_RESOLVE 0x00000001 /* do resolve the FQDN to an IP address */ +#define PA_O_PORT_OK 0x00000002 /* ports are supported */ +#define PA_O_PORT_MAND 0x00000004 /* ports are mandatory */ +#define PA_O_PORT_RANGE 0x00000008 /* port ranges are supported */ +#define PA_O_PORT_OFS 0x00000010 /* port offsets are supported */ +#define PA_O_SOCKET_FD 0x00000020 /* inherited socket FDs are supported */ +#define PA_O_RAW_FD 0x00000040 /* inherited raw FDs are supported (pipes, ttys, ...) */ +#define PA_O_DGRAM 0x00000080 /* the address can be used for a datagram socket (in or out) */ +#define PA_O_STREAM 0x00000100 /* the address can be used for streams (in or out) */ +#define PA_O_XPRT 0x00000200 /* transport protocols may be specified */ +#define PA_O_CONNECT 0x00000400 /* the protocol must have a ->connect method */ +#define PA_O_DEFAULT_DGRAM 0x00000800 /* by default, this address will be used for a datagram socket */ + +/* UTF-8 decoder status */ +#define UTF8_CODE_OK 0x00 +#define UTF8_CODE_OVERLONG 0x10 +#define UTF8_CODE_INVRANGE 0x20 +#define UTF8_CODE_BADSEQ 0x40 + +/* HAP_STRING() makes a string from a literal while HAP_XSTRING() first + * evaluates the argument and is suited to pass macros. + * + * They allow macros like PCRE_MAJOR to be defined without quotes, which + * is convenient for applications that want to test its value. + */ +#define HAP_STRING(...) #__VA_ARGS__ +#define HAP_XSTRING(...) HAP_STRING(__VA_ARGS__) + +/* operators to compare values. They're ordered that way so that the lowest bit + * serves as a negation for the test and contains all tests that are not equal. 
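+ *
+ * For example, since the lowest bit negates the test, an operator returned
+ * by get_std_op() (declared in tools.h) may be negated with a simple XOR
+ * (usage sketch, <negate> being a hypothetical caller-side flag):
+ *
+ *     int op = get_std_op("eq");   // STD_OP_EQ (2)
+ *     if (negate)
+ *         op ^= 1;                 // becomes STD_OP_NE (3)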
+ */ +enum { + STD_OP_LE = 0, STD_OP_GT = 1, + STD_OP_EQ = 2, STD_OP_NE = 3, + STD_OP_GE = 4, STD_OP_LT = 5, +}; + +enum http_scheme { + SCH_HTTP, + SCH_HTTPS, +}; + +/* output format used by url2sa() */ +struct split_url { + enum http_scheme scheme; + const char *host; + int host_len; +}; + +/* generic structure associating a name and a value, for use in arrays */ +struct name_desc { + const char *name; + const char *desc; +}; + +struct net_addr { + int family; /* AF_INET or AF_INET6 if defined, AF_UNSET if undefined */ + union { + struct { + struct in_addr ip; + struct in_addr mask; + } v4; + struct { + struct in6_addr ip; + struct in6_addr mask; + } v6; + } addr; +}; + +/* holds socket and xprt types for a given address */ +struct net_addr_type { + int proto_type; // socket layer + int xprt_type; // transport layer +}; + +#endif /* _HAPROXY_TOOLS_T_H */ diff --git a/include/haproxy/tools.h b/include/haproxy/tools.h new file mode 100644 index 0000000..3726f63 --- /dev/null +++ b/include/haproxy/tools.h @@ -0,0 +1,1179 @@ +/* + * include/haproxy/tools.h + * This files contains some general purpose functions and macros. + * + * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_TOOLS_H +#define _HAPROXY_TOOLS_H + +#ifdef USE_BACKTRACE +#define _GNU_SOURCE +#include <execinfo.h> +#endif + +#include <string.h> +#include <stdio.h> +#include <time.h> +#include <stdarg.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <netinet/in.h> +#include <arpa/inet.h> +#include <import/eb32sctree.h> +#include <import/eb32tree.h> +#include <haproxy/api.h> +#include <haproxy/chunk.h> +#include <haproxy/intops.h> +#include <haproxy/namespace-t.h> +#include <haproxy/protocol-t.h> +#include <haproxy/tools-t.h> + +/****** string-specific macros and functions ******/ +/* if a > max, then bound <a> to <max>. The macro returns the new <a> */ +#define UBOUND(a, max) ({ typeof(a) b = (max); if ((a) > b) (a) = b; (a); }) + +/* if a < min, then bound <a> to <min>. The macro returns the new <a> */ +#define LBOUND(a, min) ({ typeof(a) b = (min); if ((a) < b) (a) = b; (a); }) + +#define SWAP(a, b) do { typeof(a) t; t = a; a = b; b = t; } while(0) + +/* use if you want to return a simple hash. Key 0 doesn't hash. */ +#define HA_ANON_STR(key, str) hash_anon(key, str, "", "") + +/* use if you want to return a hash like : ID('hash'). Key 0 doesn't hash. */ +#define HA_ANON_ID(key, str) hash_anon(key, str, "ID(", ")") + +/* use if you want to return a hash like : PATH('hash'). Key 0 doesn't hash. */ +#define HA_ANON_PATH(key, str) hash_anon(key, str, "PATH(", ")") + +/* use only in a function that contains an appctx (key comes from appctx). 
*/
+#define HA_ANON_CLI(str) hash_anon(appctx->cli_anon_key, str, "", "")
+
+
+/*
+ * copies at most <size-1> chars from <src> to <dst>. Last char is always
+ * set to 0, unless <size> is 0. The number of chars copied is returned
+ * (excluding the terminating zero).
+ * This code has been optimized for size and speed : on x86, it's 45 bytes
+ * long, uses only registers, and consumes only 4 cycles per char.
+ */
+extern int strlcpy2(char *dst, const char *src, int size);
+
+/*
+ * This function simply returns a locally allocated string containing
+ * the ascii representation for number 'n' in decimal.
+ */
+extern THREAD_LOCAL int itoa_idx; /* index of next itoa_str to use */
+extern THREAD_LOCAL char itoa_str[][171];
+extern int build_is_static;
+extern char *ultoa_r(unsigned long n, char *buffer, int size);
+extern char *lltoa_r(long long int n, char *buffer, int size);
+extern char *sltoa_r(long n, char *buffer, int size);
+extern const char *ulltoh_r(unsigned long long n, char *buffer, int size);
+size_t flt_trim(char *buffer, size_t num_start, size_t len);
+char *ftoa_r(double n, char *buffer, int size);
+static inline const char *ultoa(unsigned long n)
+{
+	return ultoa_r(n, itoa_str[0], sizeof(itoa_str[0]));
+}
+
+/*
+ * unsigned long long ASCII representation
+ *
+ * return the last char '\0' or NULL if not enough
+ * space in dst
+ */
+char *ulltoa(unsigned long long n, char *dst, size_t size);
+
+
+/*
+ * unsigned long ASCII representation
+ *
+ * return the last char '\0' or NULL if not enough
+ * space in dst
+ */
+char *ultoa_o(unsigned long n, char *dst, size_t size);
+
+/*
+ * signed long ASCII representation
+ *
+ * return the last char '\0' or NULL if not enough
+ * space in dst
+ */
+char *ltoa_o(long int n, char *dst, size_t size);
+
+/*
+ * signed long long ASCII representation
+ *
+ * return the last char '\0' or NULL if not enough
+ * space in dst
+ */
+char *lltoa(long long n, char *dst, size_t size);
+
+/*
+ * write an ascii representation of an unsigned into dst,
+ * return a pointer to the last character
+ * Pad the ascii representation with '0', using size.
+ */
+char *utoa_pad(unsigned int n, char *dst, size_t size);
+
+/*
+ * This function simply returns a locally allocated string containing the ascii
+ * representation for number 'n' in decimal, unless n is 0 in which case it
+ * returns the alternate string (or an empty string if the alternate string is
+ * NULL). Its use is intended for limits reported in reports, where it's
+ * desirable not to display anything if there is no limit. Warning! it shares
+ * the same vector as ultoa_r().
+ */
+extern const char *limit_r(unsigned long n, char *buffer, int size, const char *alt);
+
+/* returns a locally allocated string containing the ASCII representation of
+ * the number 'n' in decimal. Up to NB_ITOA_STR calls may be used in the same
+ * function call (eg: printf), shared with the other similar functions making
+ * use of itoa_str[].
+ */
+static inline const char *U2A(unsigned long n)
+{
+	const char *ret = ultoa_r(n, itoa_str[itoa_idx], sizeof(itoa_str[0]));
+	if (++itoa_idx >= NB_ITOA_STR)
+		itoa_idx = 0;
+	return ret;
+}
+
+/* returns a locally allocated string containing the HTML representation of
+ * the number 'n' in decimal. Up to NB_ITOA_STR calls may be used in the same
+ * function call (eg: printf), shared with the other similar functions making
+ * use of itoa_str[].
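+ *
+ * Usage sketch (hypothetical counters): several conversions may share one
+ * statement as long as no more than NB_ITOA_STR of them are used at once:
+ *
+ *     printf("in=%s out=%s\n", U2A(bytes_in), U2A(bytes_out));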
+ */
+static inline const char *U2H(unsigned long long n)
+{
+	const char *ret = ulltoh_r(n, itoa_str[itoa_idx], sizeof(itoa_str[0]));
+	if (++itoa_idx >= NB_ITOA_STR)
+		itoa_idx = 0;
+	return ret;
+}
+
+/* returns a locally allocated string containing the ASCII representation of
+ * the number 'n' in decimal. Up to NB_ITOA_STR calls may be used in the same
+ * function call (eg: printf), shared with the other similar functions making
+ * use of itoa_str[].
+ */
+static inline const char *F2A(double n)
+{
+	const char *ret = ftoa_r(n, itoa_str[itoa_idx], sizeof(itoa_str[0]));
+	if (++itoa_idx >= NB_ITOA_STR)
+		itoa_idx = 0;
+	return ret;
+}
+
+/* returns a locally allocated string containing the HTML representation of
+ * the number 'n' in decimal. Up to NB_ITOA_STR calls may be used in the same
+ * function call (eg: printf), shared with the other similar functions making
+ * use of itoa_str[].
+ */
+static inline const char *F2H(double n)
+{
+	const char *ret = ftoa_r(n, itoa_str[itoa_idx], sizeof(itoa_str[0]));
+	if (++itoa_idx >= NB_ITOA_STR)
+		itoa_idx = 0;
+	return ret;
+}
+
+/* returns a locally allocated string containing the ASCII representation of
+ * the number 'n' in decimal. Up to NB_ITOA_STR calls may be used in the same
+ * function call (eg: printf), shared with the other similar functions making
+ * use of itoa_str[].
+ */
+static inline const char *LIM2A(unsigned long n, const char *alt)
+{
+	const char *ret = limit_r(n, itoa_str[itoa_idx], sizeof(itoa_str[0]), alt);
+	if (++itoa_idx >= NB_ITOA_STR)
+		itoa_idx = 0;
+	return ret;
+}
+
+/* returns a locally allocated string containing the quoted encoding of the
+ * input string. The output may be truncated to QSTR_SIZE chars, but it is
+ * guaranteed that the string will always be properly terminated. Quotes are
+ * encoded by doubling them as is commonly done in CSV files. QSTR_SIZE must
+ * always be at least 4 chars.
+ */
+const char *qstr(const char *str);
+
+/* returns <str> or its quote-encoded equivalent if it contains at least one
+ * quote or a comma. This is aimed at building CSV-compatible strings.
+ */
+static inline const char *cstr(const char *str)
+{
+	const char *p = str;
+
+	while (*p) {
+		if (*p == ',' || *p == '"')
+			return qstr(str);
+		p++;
+	}
+	return str;
+}
+
+/*
+ * Returns non-zero if character <s> is a hex digit (0-9, a-f, A-F), else zero.
+ */
+extern int ishex(char s);
+
+/*
+ * Checks <name> for invalid characters. Valid chars are [A-Za-z0-9_:.-]. If an
+ * invalid character is found, a pointer to it is returned. If everything is
+ * fine, NULL is returned.
+ */
+extern const char *invalid_char(const char *name);
+
+/*
+ * Checks <name> for invalid characters. Valid chars are [A-Za-z0-9_.-].
+ * If an invalid character is found, a pointer to it is returned.
+ * If everything is fine, NULL is returned.
+ */
+extern const char *invalid_domainchar(const char *name);
+
+/*
+ * Checks <name> for invalid characters. Valid chars are [A-Za-z_.-].
+ * If an invalid character is found, a pointer to it is returned.
+ * If everything is fine, NULL is returned.
+ */
+extern const char *invalid_prefix_char(const char *name);
+
+/* returns true if <c> is an identifier character, that is, a digit, a letter,
+ * or '-', '+', '_', ':' or '.'. This is usable for proxy names, server names,
+ * ACL names, sample fetch names, and converter names.
+ */
+static inline int is_idchar(char c)
+{
+	return isalnum((unsigned char)c) ||
+	       c == '.'
|| c == '_' || c == '-' || c == '+' || c == ':';
+}
+
+/*
+ * converts <str> to a locally allocated struct sockaddr_storage *, and a
+ * port range consisting of two integers. The low and high end are always set
+ * even if the port is unspecified, in which case (0,0) is returned. The low
+ * port is set in the sockaddr. Thus, it is enough to check the size of the
+ * returned range to know if an array must be allocated or not. The format is
+ * "addr[:[port[-port]]]", where "addr" can be a dotted IPv4 address, an IPv6
+ * address, a host name, or empty or "*" to indicate INADDR_ANY. If an IPv6
+ * address wants to ignore port, it must be terminated by a trailing colon (':').
+ * The IPv6 '::' address is IN6ADDR_ANY, so in order to bind to a given port on
+ * IPv6, use ":::port". NULL is returned if the host part cannot be resolved.
+ * If <pfx> is non-null, it is used as a string prefix before any path-based
+ * address (typically the path to a unix socket).
+ */
+struct sockaddr_storage *str2sa_range(const char *str, int *port, int *low, int *high, int *fd,
+                                      struct protocol **proto, struct net_addr_type *sa_type,
+                                      char **err, const char *pfx, char **fqdn, unsigned int opts);
+
+
+/* converts <addr> and <port> into a string representation of the address and port. This is sort
+ * of an inverse of str2sa_range, with some restrictions. The supported families are AF_INET,
+ * AF_INET6, AF_UNIX, and AF_CUST_SOCKPAIR. If the family is unsupported, NULL is returned.
+ * If map_ports is true, then the sign of the port is included in the output, to indicate it is
+ * relative to the incoming port. AF_INET and AF_INET6 will be in the form "<addr>:<port>".
+ * AF_UNIX will either be just the path (if using a pathname) or "abns@<path>" if it is abstract.
+ * AF_CUST_SOCKPAIR will be of the form "sockpair@<fd>".
+ *
+ * The returned char* is allocated, and it is the responsibility of the caller to free it.
+ */
+char *sa2str(const struct sockaddr_storage *addr, int port, int map_ports);
+
+/* converts <str> to a struct in_addr containing a network mask. It can be
+ * passed in dotted form (255.255.255.0) or in CIDR form (24). It returns 1
+ * if the conversion succeeds otherwise zero.
+ */
+int str2mask(const char *str, struct in_addr *mask);
+
+/* converts <str> to a struct in6_addr containing a network mask. It can be
+ * passed in quadruplet form (ffff:ffff::) or in CIDR form (64). It returns 1
+ * if the conversion succeeds otherwise zero.
+ */
+int str2mask6(const char *str, struct in6_addr *mask);
+
+/* convert <cidr> to struct in_addr <mask>. It returns 1 if the conversion
+ * succeeds otherwise zero.
+ */
+int cidr2dotted(int cidr, struct in_addr *mask);
+
+/*
+ * converts <str> to two struct in_addr* which must be pre-allocated.
+ * The format is "addr[/mask]", where "addr" cannot be empty, and mask
+ * is optional and either in the dotted or CIDR notation.
+ * Note: "addr" can also be a hostname. Returns 1 if OK, 0 if error.
+ */
+int str2net(const char *str, int resolve, struct in_addr *addr, struct in_addr *mask);
+
+/* str2ip and str2ip2:
+ *
+ * converts <str> to a struct sockaddr_storage* provided by the caller. The
+ * caller must have zeroed <sa> first, and may have set sa->ss_family to force
+ * parse a specific address format. If the ss_family is 0 or AF_UNSPEC, then
+ * the function tries to guess the address family from the syntax. If the
+ * family is forced and the format doesn't match, an error is returned. The
+ * string is assumed to contain only an address, no port.
The address can be a
+ * dotted IPv4 address, an IPv6 address, a host name, or empty or "*" to
+ * indicate INADDR_ANY. NULL is returned if the host part cannot be resolved.
+ * The return address will only have the address family and the address set,
+ * all other fields remain zero. The string is not supposed to be modified.
+ * The IPv6 '::' address is IN6ADDR_ANY.
+ *
+ * str2ip2:
+ *
+ * If <resolve> is set, this function tries to resolve DNS, otherwise it
+ * returns NULL.
+ */
+struct sockaddr_storage *str2ip2(const char *str, struct sockaddr_storage *sa, int resolve);
+static inline struct sockaddr_storage *str2ip(const char *str, struct sockaddr_storage *sa)
+{
+	return str2ip2(str, sa, 1);
+}
+
+/*
+ * converts <str> to a struct in6_addr and a prefix length, both of which must
+ * be pre-allocated. The format is "addr[/mask]", where "addr" cannot be empty,
+ * and mask is an optional number of bits (128 being the default).
+ * Returns 1 if OK, 0 if error.
+ */
+int str62net(const char *str, struct in6_addr *addr, unsigned char *mask);
+
+/*
+ * Parses an IP address found in a URL.
+ */
+int url2ipv4(const char *addr, struct in_addr *dst);
+
+/*
+ * Resolves the destination server from a URL. Converts <url> to a sockaddr_storage*.
+ */
+int url2sa(const char *url, int ulen, struct sockaddr_storage *addr, struct split_url *out);
+
+/* Tries to convert a sockaddr_storage address to text form. Upon success, the
+ * address family is returned so that it's easy for the caller to adapt to the
+ * output format. Zero is returned if the address family is not supported. -1
+ * is returned upon error, with errno set. AF_INET, AF_INET6 and AF_UNIX are
+ * supported.
+ */
+int addr_to_str(const struct sockaddr_storage *addr, char *str, int size);
+
+/* Tries to convert a sockaddr_storage port to text form. Upon success, the
+ * address family is returned so that it's easy for the caller to adapt to the
+ * output format. Zero is returned if the address family is not supported. -1
+ * is returned upon error, with errno set. AF_INET, AF_INET6 and AF_UNIX are
+ * supported.
+ */
+int port_to_str(const struct sockaddr_storage *addr, char *str, int size);
+
+/* check if the given address is local to the system or not. It will return
+ * -1 when it's not possible to know, 0 when the address is not local, 1 when
+ * it is. We don't want to iterate over all interfaces for this (and it is not
+ * portable). So instead we try to bind in UDP to this address on a free non
+ * privileged port and to connect to the same address, port 0 (connect doesn't
+ * care). If it succeeds, we own the address. Note that non-inet addresses are
+ * considered local since they're most likely AF_UNIX.
+ */
+int addr_is_local(const struct netns_entry *ns,
+                  const struct sockaddr_storage *orig);
+
+/* will try to encode the string <string> replacing all characters tagged in
+ * <map> with the hexadecimal representation of their ASCII-code (2 digits)
+ * prefixed by <escape>, and will store the result between <start> (included)
+ * and <stop> (excluded), and will always terminate the string with a '\0'
+ * before <stop>. The position of the '\0' is returned if the conversion
+ * completes. If bytes are missing between <start> and <stop>, then the
+ * conversion will be incomplete and truncated. If <stop> <= <start>, the '\0'
+ * cannot even be stored so we return <start> without writing the 0.
+ * The input string must also be zero-terminated.
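+ *
+ * Minimal sketch, assuming <map> tags the space character and '%' is used
+ * as the escape byte:
+ *
+ *     char out[64];
+ *     char *end = encode_string(out, out + sizeof(out), '%',
+ *                               query_encode_map, "a b");
+ *     // out would then hold "a%20b", <end> pointing to its '\0'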
+ */
+extern const char hextab[];
+extern long query_encode_map[];
+char *encode_string(char *start, char *stop,
+                    const char escape, const long *map,
+                    const char *string);
+
+/*
+ * Same behavior, except that it encodes chunk <chunk> instead of a string.
+ */
+char *encode_chunk(char *start, char *stop,
+                   const char escape, const long *map,
+                   const struct buffer *chunk);
+
+/*
+ * Tries to prefix characters tagged in the <map> with the <escape>
+ * character. The input <string> is processed until <string_stop>
+ * is reached or a NUL byte is encountered. The result will
+ * be stored between <start> (included) and <stop> (excluded). This
+ * function will always try to terminate the resulting string with a '\0'
+ * before <stop>, and will return its position if the conversion
+ * completes.
+ */
+char *escape_string(char *start, char *stop,
+                    const char escape, const long *map,
+                    const char *string, const char *string_stop);
+
+/* Checks a string for use in a CSV output format. If the string contains
+ * one of the following four chars <">, <,>, CR or LF, the string is
+ * encapsulated between <"> and the <"> are escaped by a <""> sequence.
+ * <str> is the input string to be escaped. The function assumes that
+ * the input string is null-terminated.
+ *
+ * If <quote> is 0, the result is returned escaped but without double quote.
+ * It is useful if the escaped string is used between double quotes in the
+ * format.
+ *
+ *    printf("..., \"%s\", ...\r\n", csv_enc(str, 0, 0, &trash));
+ *
+ * If <quote> is 1, the converter puts the quotes only if any character is
+ * escaped. If <quote> is 2, the converter always puts the quotes.
+ *
+ * If <oneline> is not 0, CRs are skipped and LFs are replaced by spaces.
+ * This re-formats multi-line strings into a single line. The purpose is to
+ * allow a line by line parsing but also to keep the output compliant with
+ * the CLI, which uses LF to define the end of the response.
+ *
+ * If <oneline> is 2, in addition to the previous actions, the trailing
+ * spaces are removed.
+ *
+ * <output> is a struct chunk used for storing the output string.
+ *
+ * The function returns the converted string on its output. If an error
+ * occurs, the function returns an empty string. This type of output is useful
+ * for using the function directly as a printf() argument.
+ *
+ * If the output buffer is too short to contain the input string, the result
+ * is truncated.
+ *
+ * This function appends the encoding to the existing output chunk. Please
+ * use csv_enc() instead if you want to replace the output chunk.
+ */
+const char *csv_enc_append(const char *str, int quote, int oneline,
+                           struct buffer *output);
+
+/* same as above but the output chunk is reset first */
+static inline const char *csv_enc(const char *str, int quote, int oneline,
+                                  struct buffer *output)
+{
+	chunk_reset(output);
+	return csv_enc_append(str, quote, oneline, output);
+}
+
+/* Decodes a URL-encoded string in-place. The resulting string might
+ * be shorter. If some forbidden characters are found, the conversion is
+ * aborted, the string is truncated before the offending character and zero
+ * is returned; otherwise the operation returns non-zero, indicating success.
+ * If the 'in_form' argument is non-null the string is assumed to be part of
+ * an "application/x-www-form-urlencoded" encoded string, and the '+' will be
+ * turned to a space. If it's zero, this will only be done after a question
+ * mark ('?').
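+ *
+ * Example (sketch, in-place decoding of a form string):
+ *
+ *     char q[] = "a=b%20c+d";
+ *     if (url_decode(q, 1))
+ *         ...; // q now contains "a=b c d"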
+ */ +int url_decode(char *string, int in_form); + +unsigned int inetaddr_host(const char *text); +unsigned int inetaddr_host_lim(const char *text, const char *stop); +unsigned int inetaddr_host_lim_ret(char *text, char *stop, char **ret); + +/* Function that hashes or not a string according to the anonymizing key (scramble). */ +const char *hash_anon(uint32_t scramble, const char *string2hash, const char *prefix, const char *suffix); + +/* Function that hashes or not an ip according to the ipstring entered */ +const char * hash_ipanon(uint32_t scramble, char *ipstring, int hasport); + +static inline char *cut_crlf(char *s) { + + while (*s != '\r' && *s != '\n') { + char *p = s++; + + if (!*p) + return p; + } + + *s++ = '\0'; + + return s; +} + +static inline char *ltrim(char *s, char c) { + + if (c) + while (*s == c) + s++; + + return s; +} + +static inline char *rtrim(char *s, char c) { + + char *p = s + strlen(s); + + while (p-- > s) + if (*p == c) + *p = '\0'; + else + break; + + return s; +} + +static inline char *alltrim(char *s, char c) { + + rtrim(s, c); + + return ltrim(s, c); +} + +/* This function converts the time_t value <now> into a broken out struct tm + * which must be allocated by the caller. It is highly recommended to use this + * function instead of localtime() because that one requires a time_t* which + * is not always compatible with tv_sec depending on OS/hardware combinations. + */ +static inline void get_localtime(const time_t now, struct tm *tm) +{ + localtime_r(&now, tm); +} + +/* This function converts the time_t value <now> into a broken out struct tm + * which must be allocated by the caller. It is highly recommended to use this + * function instead of gmtime() because that one requires a time_t* which + * is not always compatible with tv_sec depending on OS/hardware combinations. + */ +static inline void get_gmtime(const time_t now, struct tm *tm) +{ + gmtime_r(&now, tm); +} + +/* Counts a number of elapsed days since 01/01/0000 based solely on elapsed + * years and assuming the regular rule for leap years applies. It's fake but + * serves as a temporary origin. It's worth remembering that it's the first + * year of each period that is leap and not the last one, so for instance year + * 1 sees 366 days since year 0 was leap. For this reason we have to apply + * modular arithmetic which is why we offset the year by 399 before + * subtracting the excess at the end. No overflow here before ~11.7 million + * years. + */ +static inline unsigned int days_since_zero(unsigned int y) +{ + return y * 365 + (y + 399) / 4 - (y + 399) / 100 + (y + 399) / 400 + - 399 / 4 + 399 / 100; +} + +/* Returns the number of seconds since 01/01/1970 0:0:0 GMT for GMT date <tm>. + * It is meant as a portable replacement for timegm() for use with valid inputs. + * Returns undefined results for invalid dates (eg: months out of range 0..11). + */ +extern time_t my_timegm(const struct tm *tm); + +/* This function parses a time value optionally followed by a unit suffix among + * "d", "h", "m", "s", "ms" or "us". It converts the value into the unit + * expected by the caller. The computation does its best to avoid overflows. + * The value is returned in <ret> if everything is fine, and a NULL is returned + * by the function. In case of error, a pointer to the error is returned and + * <ret> is left untouched. 
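+ *
+ * Usage sketch, reading a timeout expressed in milliseconds:
+ *
+ *     unsigned timeout;
+ *     const char *err = parse_time_err("5s", &timeout, TIME_UNIT_MS);
+ *     if (!err)
+ *         ...; // timeout == 5000 (5s converted to ms)
+ *     // otherwise err is PARSE_TIME_OVER/UNDER or points to the bad char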
+ */ +extern const char *parse_time_err(const char *text, unsigned *ret, unsigned unit_flags); +extern const char *parse_size_err(const char *text, unsigned *ret); + +/* + * Parse binary string written in hexadecimal (source) and store the decoded + * result into binstr and set binstrlen to the length of binstr. Memory for + * binstr is allocated by the function. In case of error, returns 0 with an + * error message in err. + */ +int parse_binary(const char *source, char **binstr, int *binstrlen, char **err); + +/* copies at most <n> characters from <src> and always terminates with '\0' */ +char *my_strndup(const char *src, int n); + +/* + * search needle in haystack + * returns the pointer if found, returns NULL otherwise + */ +const void *my_memmem(const void *, size_t, const void *, size_t); + +/* get length of the initial segment consisting entirely of bytes within a given + * mask + */ +size_t my_memspn(const void *, size_t, const void *, size_t); + +/* get length of the initial segment consisting entirely of bytes not within a + * given mask + */ +size_t my_memcspn(const void *, size_t, const void *, size_t); + +/* This function returns the first unused key greater than or equal to <key> in + * ID tree <root>. Zero is returned if no place is found. + */ +unsigned int get_next_id(struct eb_root *root, unsigned int key); + +/* dump the full tree to <file> in DOT format for debugging purposes. Will + * optionally highlight node <subj> if found, depending on operation <op> : + * 0 : nothing + * >0 : insertion, node/leaf are surrounded in red + * <0 : removal, node/leaf are dashed with no background + * Will optionally add "desc" as a label on the graph if set and non-null. + */ +void eb32sc_to_file(FILE *file, struct eb_root *root, const struct eb32sc_node *subj, + int op, const char *desc); + +/* same but for ebmb */ +void ebmb_to_file(FILE *file, struct eb_root *root, const struct ebmb_node *subj, + int op, const char *desc); + +/* This function compares a sample word possibly followed by blanks to another + * clean word. The compare is case-insensitive. 1 is returned if both are equal, + * otherwise zero. This intends to be used when checking HTTP headers for some + * values. + */ +int word_match(const char *sample, int slen, const char *word, int wlen); + +/* Convert a fixed-length string to an IP address. Returns 0 in case of error, + * or the number of chars read in case of success. + */ +int buf2ip(const char *buf, size_t len, struct in_addr *dst); +int buf2ip6(const char *buf, size_t len, struct in6_addr *dst); + +/* To be used to quote config arg positions. Returns the string at <ptr> + * surrounded by simple quotes if <ptr> is valid and non-empty, or "end of line" + * if ptr is NULL or empty. The string is locally allocated. + */ +const char *quote_arg(const char *ptr); + +/* returns an operator among STD_OP_* for string <str> or < 0 if unknown */ +int get_std_op(const char *str); + +/* sets the address family to AF_UNSPEC so that is_addr() does not match */ +static inline void clear_addr(struct sockaddr_storage *addr) +{ + addr->ss_family = AF_UNSPEC; +} + +/* returns non-zero if addr has a valid and non-null IPv4 or IPv6 address, + * otherwise zero. 
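+ *
+ * Sketch: only consider usable inet destinations, e.g.:
+ *
+ *     if (is_inet_addr(&ss))
+ *         port = get_host_port(&ss); // helper declared below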
+ */
+static inline int is_inet_addr(const struct sockaddr_storage *addr)
+{
+	int i;
+
+	switch (addr->ss_family) {
+	case AF_INET:
+		return *(int *)&((struct sockaddr_in *)addr)->sin_addr;
+	case AF_INET6:
+		for (i = 0; i < sizeof(struct in6_addr) / sizeof(int); i++)
+			if (((int *)&((struct sockaddr_in6 *)addr)->sin6_addr)[i] != 0)
+				return ((int *)&((struct sockaddr_in6 *)addr)->sin6_addr)[i];
+	}
+	return 0;
+}
+
+/* returns non-zero if addr has a valid and non-null IPv4 or IPv6 address,
+ * or is a unix address, otherwise returns zero.
+ */
+static inline int is_addr(const struct sockaddr_storage *addr)
+{
+	if (addr->ss_family == AF_UNIX || addr->ss_family == AF_CUST_SOCKPAIR)
+		return 1;
+	else
+		return is_inet_addr(addr);
+}
+
+/* returns port in network byte order */
+static inline int get_net_port(const struct sockaddr_storage *addr)
+{
+	switch (addr->ss_family) {
+	case AF_INET:
+		return ((struct sockaddr_in *)addr)->sin_port;
+	case AF_INET6:
+		return ((struct sockaddr_in6 *)addr)->sin6_port;
+	}
+	return 0;
+}
+
+/* returns port in host byte order */
+static inline int get_host_port(const struct sockaddr_storage *addr)
+{
+	switch (addr->ss_family) {
+	case AF_INET:
+		return ntohs(((struct sockaddr_in *)addr)->sin_port);
+	case AF_INET6:
+		return ntohs(((struct sockaddr_in6 *)addr)->sin6_port);
+	}
+	return 0;
+}
+
+/* returns address len for <addr>'s family, 0 for unknown families */
+static inline int get_addr_len(const struct sockaddr_storage *addr)
+{
+	switch (addr->ss_family) {
+	case AF_INET:
+		return sizeof(struct sockaddr_in);
+	case AF_INET6:
+		return sizeof(struct sockaddr_in6);
+	case AF_UNIX:
+		return sizeof(struct sockaddr_un);
+	}
+	return 0;
+}
+
+/* sets the port, which must be given in network byte order */
+static inline int set_net_port(struct sockaddr_storage *addr, int port)
+{
+	switch (addr->ss_family) {
+	case AF_INET:
+		((struct sockaddr_in *)addr)->sin_port = port;
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)addr)->sin6_port = port;
+		break;
+	}
+	return 0;
+}
+
+/* sets the port from a value given in host byte order */
+static inline int set_host_port(struct sockaddr_storage *addr, int port)
+{
+	switch (addr->ss_family) {
+	case AF_INET:
+		((struct sockaddr_in *)addr)->sin_port = htons(port);
+		break;
+	case AF_INET6:
+		((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
+		break;
+	}
+	return 0;
+}
+
+/* Convert mask from bit length form to in_addr form.
+ * This function never fails.
+ */
+void len2mask4(int len, struct in_addr *addr);
+
+/* Convert mask from bit length form to in6_addr form.
+ * This function never fails.
+ */
+void len2mask6(int len, struct in6_addr *addr);
+
+/* Return true if IPv4 address is part of the network */
+extern int in_net_ipv4(const void *addr, const struct in_addr *mask, const struct in_addr *net);
+
+/* Return true if IPv6 address is part of the network */
+extern int in_net_ipv6(const void *addr, const struct in6_addr *mask, const struct in6_addr *net);
+
+/* Map IPv4 address on IPv6 address, as specified in RFC 3513. */
+extern void v4tov6(struct in6_addr *sin6_addr, struct in_addr *sin_addr);
+
+/* Map IPv6 address on IPv4 address, as specified in RFC 3513.
+ * Return true if conversion is possible and false otherwise.
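+ *
+ * Round-trip sketch: a v4-mapped IPv6 address converts back losslessly:
+ *
+ *     struct in6_addr v6;
+ *     struct in_addr v4 = { .s_addr = htonl(0xC0A80001) }; // 192.168.0.1
+ *     v4tov6(&v6, &v4);      // now ::ffff:192.168.0.1
+ *     if (v6tov4(&v4, &v6))
+ *         ...; // v4 holds 192.168.0.1 again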
+ */
+extern int v6tov4(struct in_addr *sin_addr, struct in6_addr *sin6_addr);
+
+/* compare two struct sockaddr_storage, including port if <check_port> is true,
+ * and return:
+ *  0 (true)   if the addr is the same in both
+ *  1 (false)  if the addr is not the same in both
+ * -1 (unable) if one of the addr is not AF_INET*
+ */
+int ipcmp(const struct sockaddr_storage *ss1, const struct sockaddr_storage *ss2, int check_port);
+
+/* compare a struct sockaddr_storage to a struct net_addr and return :
+ *  0 (true)   if <addr> is matching <net>
+ *  1 (false)  if <addr> is not matching <net>
+ * -1 (unable) if <addr> or <net> is not AF_INET*
+ */
+int ipcmp2net(const struct sockaddr_storage *addr, const struct net_addr *net);
+
+/* copies the IP address from <source> into <dest>.
+ * The caller must clear <dest> before calling.
+ * Returns a pointer to the destination.
+ */
+struct sockaddr_storage *ipcpy(const struct sockaddr_storage *source, struct sockaddr_storage *dest);
+
+char *human_time(int t, short hz_div);
+
+extern const char *monthname[];
+
+/* date2str_log: write a date in the format :
+ *   sprintf(str, "%02d/%s/%04d:%02d:%02d:%02d.%03d",
+ *           tm.tm_mday, monthname[tm.tm_mon], tm.tm_year+1900,
+ *           tm.tm_hour, tm.tm_min, tm.tm_sec, (int)date.tv_usec/1000);
+ *
+ * without using sprintf. return a pointer to the last char written (\0) or
+ * NULL if there isn't enough space.
+ */
+char *date2str_log(char *dest, const struct tm *tm, const struct timeval *date, size_t size);
+
+/* Return the GMT offset for a specific local time.
+ * Both t and tm must represent the same time.
+ * The string returned has the same format as returned by strftime(... "%z", tm).
+ * Offsets are kept in an internal cache for better performance.
+ */
+const char *get_gmt_offset(time_t t, struct tm *tm);
+
+/* gmt2str_log: write a date in the format :
+ * "%02d/%s/%04d:%02d:%02d:%02d +0000" without using snprintf
+ * return a pointer to the last char written (\0) or
+ * NULL if there isn't enough space.
+ */
+char *gmt2str_log(char *dst, struct tm *tm, size_t size);
+
+/* localdate2str_log: write a date in the format :
+ * "%02d/%s/%04d:%02d:%02d:%02d +0000(local timezone)" without using snprintf
+ * Both t and tm must represent the same time.
+ * return a pointer to the last char written (\0) or
+ * NULL if there isn't enough space.
+ */
+char *localdate2str_log(char *dst, time_t t, struct tm *tm, size_t size);
+
+/* These functions parse a date string and fill the corresponding broken-down
+ * time in <tm>. They return 1 on success and 0 on failure.
+ */
+int parse_http_date(const char *date, int len, struct tm *tm);
+int parse_imf_date(const char *date, int len, struct tm *tm);
+int parse_rfc850_date(const char *date, int len, struct tm *tm);
+int parse_asctime_date(const char *date, int len, struct tm *tm);
+int print_time_short(struct buffer *out, const char *pfx, uint64_t ns, const char *sfx);
+
+/* Dynamically allocates a string of the proper length to hold the formatted
+ * output. NULL is returned on error. The caller is responsible for freeing the
+ * memory area using free(). The resulting string is returned in <out> if the
+ * pointer is not NULL. A previous version of <out> might be used to build the
+ * new string, and it will be freed before returning if it is not NULL, which
+ * makes it possible to build complex strings from iterative calls without
+ * having to care about freeing intermediate values, as in the example below :
+ *
+ *    memprintf(&err, "invalid argument: '%s'", arg);
+ *    ...
+ *    memprintf(&err, "parser said : <%s>\n", err);
+ *    ...
+ *    free(err);
+ *
+ * This means that <err> must be initialized to NULL before first invocation.
+ * The return value also holds the allocated string, which eases error checking
+ * and immediate consumption. If the output pointer is not used, NULL must be
+ * passed instead and it will be ignored. The returned message will then also
+ * be NULL so that the caller does not have to bother with freeing anything.
+ *
+ * It is also convenient to use it without any free except the last one :
+ *    err = NULL;
+ *    if (!fct1(&err)) report(err);
+ *    if (!fct2(&err)) report(err);
+ *    if (!fct3(&err)) report(err);
+ *    free(err);
+ *
+ * memprintf relies on memvprintf. This last version can be called from any
+ * function with variadic arguments.
+ */
+char *memvprintf(char **out, const char *format, va_list args)
+	__attribute__ ((format(printf, 2, 0)));
+
+char *memprintf(char **out, const char *format, ...)
+	__attribute__ ((format(printf, 2, 3)));
+
+/* Used to add <level> spaces before each line of <out>, unless there is only one line.
+ * The input argument is automatically freed and reassigned. The result will have to be
+ * freed by the caller.
+ * Example of use :
+ *   parse(cmd, &err); (callee: memprintf(&err, ...))
+ *   fprintf(stderr, "Parser said: %s\n", indent_msg(&err, 2));
+ *   free(err);
+ */
+char *indent_msg(char **out, int level);
+int append_prefixed_str(struct buffer *out, const char *in, const char *pfx, char eol, int first);
+
+/* removes environment variable <name> from the environment as found in
+ * environ. This is only provided as an alternative for systems without
+ * unsetenv() (old Solaris and AIX versions). THIS IS NOT THREAD SAFE.
+ * The principle is to scan environ for each occurrence of variable name
+ * <name> and to replace the matching pointers with the last pointer of
+ * the array (since variables are not ordered).
+ * It always returns 0 (success).
+ */
+int my_unsetenv(const char *name);
+
+/* Convert occurrences of environment variables in the input string to their
+ * corresponding value. A variable is identified as a series of alphanumeric
+ * characters or underscores following a '$' sign. The <in> string must be
+ * free()able. NULL returns NULL. The resulting string might be reallocated if
+ * some expansion is made.
+ */
+char *env_expand(char *in);
+uint32_t parse_line(char *in, char *out, size_t *outlen, char **args, int *nbargs, uint32_t opts, const char **errptr);
+ssize_t read_line_to_trash(const char *path_fmt, ...);
+size_t sanitize_for_printing(char *line, size_t pos, size_t width);
+void update_word_fingerprint(uint8_t *fp, const char *word);
+void make_word_fingerprint(uint8_t *fp, const char *word);
+int word_fingerprint_distance(const uint8_t *fp1, const uint8_t *fp2);
+
+/* debugging macro to emit messages using write() on fd #-1 so that strace sees
+ * them.
+ */
+#define fddebug(msg...) do { char *_m = NULL; memprintf(&_m, ##msg); if (_m) write(-1, _m, strlen(_m)); free(_m); } while (0)
+
+/* displays a <len> long memory block at <buf>, assuming first byte of <buf>
+ * has address <baseaddr>. String <pfx> may be placed as a prefix in front of
+ * each line. It may be NULL if unused. The output is emitted to file <out>.
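+ *
+ * Example (sketch, assuming a struct buffer <buf>):
+ *
+ *     debug_hexdump(stderr, "  ", buf->area, 0, buf->data);
+ *
+ * dumps the whole buffer to stderr with a two-space prefix, addresses
+ * starting at 0.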
+ */
+void debug_hexdump(FILE *out, const char *pfx, const char *buf, unsigned int baseaddr, int len);
+
+/* this is used to emit call traces when building with TRACE=1 */
+__attribute__((format(printf, 1, 2)))
+void calltrace(char *fmt, ...);
+
+/* same as strstr() but case-insensitive and length-bounded */
+const char *strnistr(const char *str1, int len_str1, const char *str2, int len_str2);
+
+int strordered(const char *s1, const char *s2, const char *s3);
+
+/* after increasing a pointer value, it can exceed the first buffer
+ * size. This function transforms the value of <ptr> according to
+ * the expected position. <chunks> is an array of the one or two
+ * available chunks. The first value is the start of the first chunk,
+ * the second value is the end+1 of the first chunk. The third value
+ * is NULL or the start of the second chunk and the fourth value is
+ * the end+1 of the second chunk. The function returns 1 if it does a
+ * wrap, else returns 0.
+ */
+static inline int fix_pointer_if_wrap(const char **chunks, const char **ptr)
+{
+	if (*ptr < chunks[1])
+		return 0;
+	if (!chunks[2])
+		return 0;
+	*ptr = chunks[2] + ( *ptr - chunks[1] );
+	return 1;
+}
+
+unsigned char utf8_next(const char *s, int len, unsigned int *c);
+
+static inline unsigned char utf8_return_code(unsigned int code)
+{
+	return code & 0xf0;
+}
+
+static inline unsigned char utf8_return_length(unsigned char code)
+{
+	return code & 0x0f;
+}
+
+/* returns a 64-bit timestamp with the finest resolution available. The
+ * unit is intentionally not specified. It's mostly used to compare dates.
+ */
+#if defined(__i386__) || defined(__x86_64__)
+static inline unsigned long long rdtsc()
+{
+	unsigned int a, d;
+	asm volatile("rdtsc" : "=a" (a), "=d" (d));
+	return a + ((unsigned long long)d << 32);
+}
+#else
+static inline unsigned long long rdtsc()
+{
+	struct timeval tv;
+	gettimeofday(&tv, NULL);
+	return tv.tv_sec * 1000000 + tv.tv_usec;
+}
+#endif
+
+/* appends a copy of string <str> (in a wordlist) at the end of the list <li>.
+ * On failure : returns 0 and <err> is filled with an error message.
+ * The caller is responsible for freeing the <err> and <str> copy
+ * memory area using free().
+ */
+struct list;
+int list_append_word(struct list *li, const char *str, char **err);
+
+int dump_text(struct buffer *out, const char *buf, int bsize);
+int dump_binary(struct buffer *out, const char *buf, int bsize);
+int dump_text_line(struct buffer *out, const char *buf, int bsize, int len,
+                   int *line, int ptr);
+void dump_addr_and_bytes(struct buffer *buf, const char *pfx, const void *addr, int n);
+void dump_hex(struct buffer *out, const char *pfx, const void *buf, int len, int unsafe);
+int may_access(const void *ptr);
+const void *resolve_sym_name(struct buffer *buf, const char *pfx, const void *addr);
+const char *get_exec_path(void);
+void *get_sym_curr_addr(const char *name);
+void *get_sym_next_addr(const char *name);
+int dump_libs(struct buffer *output, int with_addr);
+
+/* Note that this may result in opening libgcc() on first call, so it may need
+ * to have been called once before chrooting.
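+ *
+ * Typical use (sketch, assuming the global trash chunk is initialized):
+ *
+ *     void *callers[20];
+ *     int depth = my_backtrace(callers, 20);
+ *     for (int i = 0; i < depth; i++)
+ *         resolve_sym_name(&trash, "\n  ", callers[i]);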
+ */
+static forceinline int my_backtrace(void **buffer, int max)
+{
+#if !defined(USE_BACKTRACE)
+	return 0;
+#elif defined(HA_HAVE_WORKING_BACKTRACE)
+	return backtrace(buffer, max);
+#else
+	const struct frame {
+		const struct frame *next;
+		void *ra;
+	} *frame;
+	int count;
+
+	frame = __builtin_frame_address(0);
+	for (count = 0; count < max && may_access(frame) && may_access(frame->ra);) {
+		buffer[count++] = frame->ra;
+		frame = frame->next;
+	}
+	return count;
+#endif
+}
+
+/* same as realloc() except that ptr is also freed upon failure */
+static inline void *my_realloc2(void *ptr, size_t size)
+{
+	void *ret;
+
+	ret = realloc(ptr, size);
+	if (!ret && size)
+		free(ptr);
+	return ret;
+}
+
+int parse_dotted_uints(const char *s, unsigned int **nums, size_t *sz);
+
+/* PRNG */
+void ha_generate_uuid(struct buffer *output);
+void ha_random_seed(const unsigned char *seed, size_t len);
+void ha_random_jump96(uint32_t dist);
+uint64_t ha_random64(void);
+
+static inline uint32_t ha_random32()
+{
+	return ha_random64() >> 32;
+}
+
+static inline int32_t ha_random()
+{
+	return ha_random32() >> 1;
+}
+
+extern THREAD_LOCAL unsigned int statistical_prng_state;
+
+/* Xorshift RNGs from http://www.jstatsoft.org/v08/i14/paper.
+ * This has a (2^32)-1 period; only zero is never returned.
+ */
+static inline unsigned int statistical_prng()
+{
+	unsigned int x = statistical_prng_state;
+
+	x ^= x << 13;
+	x ^= x >> 17;
+	x ^= x << 5;
+	return statistical_prng_state = x;
+}
+
+/* returns a random number between 0 and <range> - 1 that is evenly distributed
+ * over the range.
+ */
+static inline uint statistical_prng_range(uint range)
+{
+	return mul32hi(statistical_prng(), range ? range - 1 : 0);
+}
+
+/* returns a hash on <bits> bits of pointer <p> that is suitable for being used
+ * to compute statistic buckets, in that it's fast and reasonably distributed
+ * thanks to mixing the bits via a multiplication by a prime number and using
+ * the middle bits on 64-bit platforms or remixing the topmost with lowest ones
+ * on 32-bit. The distribution is smooth enough for the hash to provide on
+ * average 1/e non-colliding entries per input, and use on average 1-1/e
+ * entries total. Thus for example hashing 1024 random valid pointers will
+ * result on average in ~647 distinct keys, 377 of which are unique. It was
+ * carefully selected to deliver optimal distribution for low bit counts so
+ * that hashing on 2,3,4 or 5 bits delivers good results.
+ */
+static forceinline uint ptr_hash(const void *p, const int bits)
+{
+	unsigned long long x = (unsigned long)p;
+
+	if (!bits)
+		return 0;
+
+	x *= 0xacd1be85U;
+	if (sizeof(long) == 4)
+		x ^= x >> 32;
+	else
+		x >>= 31 - (bits + 1) / 2;
+	return x & (~0U >> (-bits & 31));
+}
+
+/* Same as above but works on two pointers. It will return the same values
+ * if the second pointer is NULL.
+ */
+static forceinline uint ptr2_hash(const void *p1, const void *p2, const int bits)
+{
+	unsigned long long x = (unsigned long)p1;
+	unsigned long long y = (unsigned long)p2;
+
+	if (!bits)
+		return 0;
+
+	x *= 0xacd1be85U;
+	y *= 0x9d28e4e9U;
+	x ^= y;
+	if (sizeof(long) == 4)
+		x ^= x >> 32;
+	else
+		x >>= 33 - bits / 2;
+	return x & (~0U >> (-bits & 31));
+}
+
+
+/* Update array <fp> with the character transition <prev> to <curr>. If <prev>
+ * is zero, it's assumed that <curr> is the first character. If <curr> is zero
+ * it's assumed to mark the end. Both may be zero. <fp> is a 1024-entries array
+ * indexed as 32*from+to.
Positions for 'from' and 'to' are: + * 1..26=letter, 27=digit, 28=other/begin/end. + * Row "from=0" is used to mark the character's presence. Others unused. + */ +static inline void update_char_fingerprint(uint8_t *fp, char prev, char curr) +{ + int from, to; + + switch (prev) { + case 0: from = 28; break; // begin + case 'a'...'z': from = prev - 'a' + 1; break; + case 'A'...'Z': from = tolower(prev) - 'a' + 1; break; + case '0'...'9': from = 27; break; + default: from = 28; break; + } + + switch (curr) { + case 0: to = 28; break; // end + case 'a'...'z': to = curr - 'a' + 1; break; + case 'A'...'Z': to = tolower(curr) - 'a' + 1; break; + case '0'...'9': to = 27; break; + default: to = 28; break; + } + if (curr) + fp[to] = 1; + fp[32 * from + to]++; +} + + +/* compare the current OpenSSL version to a string */ +int openssl_compare_current_version(const char *version); +/* compare the current OpenSSL name to a string */ +int openssl_compare_current_name(const char *name); + +#endif /* _HAPROXY_TOOLS_H */ diff --git a/include/haproxy/trace-t.h b/include/haproxy/trace-t.h new file mode 100644 index 0000000..322fccd --- /dev/null +++ b/include/haproxy/trace-t.h @@ -0,0 +1,179 @@ +/* + * include/haproxy/trace-t.h + * This file provides definitions for runtime tracing + * + * Copyright (C) 2000-2019 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_TRACE_T_H +#define _HAPROXY_TRACE_T_H + +#include <import/ist.h> +#include <haproxy/api-t.h> +#include <haproxy/sink-t.h> + +/* the macros below define an optional type for each of the 4 args passed to + * the trace() call. When such a type is set, the caller commits to exclusively + * using a valid pointer when this argument is not null. This allows the trace() + * function to automatically start or stop the lock-on mechanism when it detects + * a type that it can dereference such as a connection or a stream. Each value + * is represented as an exclusive bit and each arg is represented by a distinct + * byte. The reason for using a single bit per value is to speed up tests using + * bitmasks. Users must not declare args with multiple bits set for the same arg. + * By default arguments are private, corresponding to value 0. 
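+ *
+ * For example, a trace source whose first argument is always a connection
+ * and whose second is a stream would declare (sketch, see the arg_def field
+ * of struct trace_source below):
+ *
+ *     .arg_def = TRC_ARG1_CONN | TRC_ARG2_STRM,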
+ */ + +/* for use only in macro definitions above */ +#define TRC_ARG_PRIV (0) +#define TRC_ARG_CONN (1 << 0) +#define TRC_ARG_SESS (1 << 1) +#define TRC_ARG_STRM (1 << 2) +#define TRC_ARG_CHK (1 << 3) +#define TRC_ARG_QCON (1 << 4) +#define TRC_ARG_APPCTX (1 << 5) + +#define TRC_ARG1_PRIV (TRC_ARG_PRIV << 0) +#define TRC_ARG1_CONN (TRC_ARG_CONN << 0) +#define TRC_ARG1_SESS (TRC_ARG_SESS << 0) +#define TRC_ARG1_STRM (TRC_ARG_STRM << 0) +#define TRC_ARG1_CHK (TRC_ARG_CHK << 0) +#define TRC_ARG1_QCON (TRC_ARG_QCON << 0) +#define TRC_ARG1_APPCTX (TRC_ARG_APPCTX << 0) + +#define TRC_ARG2_PRIV (TRC_ARG_PRIV << 8) +#define TRC_ARG2_CONN (TRC_ARG_CONN << 8) +#define TRC_ARG2_SESS (TRC_ARG_SESS << 8) +#define TRC_ARG2_STRM (TRC_ARG_STRM << 8) +#define TRC_ARG2_CHK (TRC_ARG_CHK << 8) +#define TRC_ARG2_QCON (TRC_ARG_QCON << 8) +#define TRC_ARG2_APPCTX (TRC_ARG_APPCTX << 8) + +#define TRC_ARG3_PRIV (TRC_ARG_PRIV << 16) +#define TRC_ARG3_CONN (TRC_ARG_CONN << 16) +#define TRC_ARG3_SESS (TRC_ARG_SESS << 16) +#define TRC_ARG3_STRM (TRC_ARG_STRM << 16) +#define TRC_ARG3_CHK (TRC_ARG_CHK << 16) +#define TRC_ARG3_QCON (TRC_ARG_QCON << 16) +#define TRC_ARG3_APPCTX (TRC_ARG_APPCTX << 16) + +#define TRC_ARG4_PRIV (TRC_ARG_PRIV << 24) +#define TRC_ARG4_CONN (TRC_ARG_CONN << 24) +#define TRC_ARG4_SESS (TRC_ARG_SESS << 24) +#define TRC_ARG4_STRM (TRC_ARG_STRM << 24) +#define TRC_ARG4_CHK (TRC_ARG_CHK << 24) +#define TRC_ARG4_QCON (TRC_ARG_QCON << 24) +#define TRC_ARG4_APPCTX (TRC_ARG_APPCTX << 24) + +/* usable to detect the presence of any arg of the desired type */ +#define TRC_ARGS_CONN (TRC_ARG_CONN * 0x01010101U) +#define TRC_ARGS_SESS (TRC_ARG_SESS * 0x01010101U) +#define TRC_ARGS_STRM (TRC_ARG_STRM * 0x01010101U) +#define TRC_ARGS_CHK (TRC_ARG_CHK * 0x01010101U) +#define TRC_ARGS_QCON (TRC_ARG_QCON * 0x01010101U) +#define TRC_ARGS_APPCTX (TRC_ARG_APPCTX * 0x01010101U) + + +enum trace_state { + TRACE_STATE_STOPPED = 0, // completely disabled + TRACE_STATE_WAITING, // waiting for the start condition to happen + TRACE_STATE_RUNNING, // waiting for the stop or pause conditions +}; + +/* trace levels, from least detailed to most detailed. Traces emitted at a + * lower level are always reported at higher levels. 
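+ *
+ * For example, a source configured at TRACE_LEVEL_STATE reports ERROR, USER,
+ * PROTO and STATE events, but neither DATA nor DEVELOPER ones.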
+ */ +enum trace_level { + TRACE_LEVEL_ERROR = 0, // only errors + TRACE_LEVEL_USER, // also info useful to the end user + TRACE_LEVEL_PROTO, // also report protocol-level updates + TRACE_LEVEL_STATE, // also report state changes + TRACE_LEVEL_DATA, // also report data exchanges + TRACE_LEVEL_DEVELOPER, // functions entry/exit and any other developer info +}; + +enum trace_lockon { + TRACE_LOCKON_NOTHING = 0, // don't lock on anything + TRACE_LOCKON_THREAD, // lock on the thread that started the trace + TRACE_LOCKON_LISTENER, // lock on the listener that started the trace + TRACE_LOCKON_FRONTEND, // lock on the frontend that started the trace + TRACE_LOCKON_BACKEND, // lock on the backend that started the trace + TRACE_LOCKON_SERVER, // lock on the server that started the trace + TRACE_LOCKON_CONNECTION, // lock on the connection that started the trace + TRACE_LOCKON_SESSION, // lock on the session that started the trace + TRACE_LOCKON_STREAM, // lock on the stream that started the trace + TRACE_LOCKON_CHECK, // lock on the check that started the trace + TRACE_LOCKON_QCON, // lock on the QUIC connection that started the trace + TRACE_LOCKON_APPCTX, // lock on the appctx that started the trace + TRACE_LOCKON_ARG1, // lock on arg1, totally source-dependent + TRACE_LOCKON_ARG2, // lock on arg2, totally source-dependent + TRACE_LOCKON_ARG3, // lock on arg3, totally source-dependent + TRACE_LOCKON_ARG4, // lock on arg4, totally source-dependent +}; + +/* Each trace event maps a name to a mask in an uint64_t. Multiple bits are + * permitted to have composite events. This is supposed to be stored into an + * array terminated by mask 0 (name and desc are then ignored). Names "now", + * "any" and "none" are reserved by the CLI parser for start/pause/stop + * operations.. + */ +struct trace_event { + uint64_t mask; + const char *name; + const char *desc; +}; + +/* Regarding the verbosity, if <decoding> is not NULL, it must point to a NULL- + * terminated array of name:description, which will define verbosity levels + * implemented by the decoding callback. The verbosity value will default to + * 1. When verbosity levels are defined, levels 1 and above are described by + * these levels. At level zero, the callback is never called. + */ +struct trace_source { + /* source definition */ + const struct ist name; + const char *desc; + const struct trace_event *known_events; + struct list source_link; // element in list of known trace sources + void (*default_cb)(enum trace_level level, uint64_t mask, + const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + uint32_t arg_def; // argument definitions (sum of TRC_ARG{1..4}_*) + const struct name_desc *lockon_args; // must be 4 entries if not NULL + const struct name_desc *decoding; // null-terminated if not NULL + /* trace configuration, adjusted by "trace <module>" on CLI */ + enum trace_lockon lockon; + uint64_t start_events; // what will start the trace. default: 0=nothing + uint64_t pause_events; // what will pause the trace. default: 0=nothing + uint64_t stop_events; // what will stop the trace. default: 0=nothing + uint64_t report_events; // mask of which events need to be reported. 
+ enum trace_level level; // report traces up to this level of info + unsigned int verbosity; // decoder's level of detail among <decoding> (0=no cb) + struct sink *sink; // where to send the trace + /* trace state part below */ + enum trace_state state; + const void *lockon_ptr; // what to lockon when lockon is set +}; + +#endif /* _HAPROXY_TRACE_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/trace.h b/include/haproxy/trace.h new file mode 100644 index 0000000..703ac8d --- /dev/null +++ b/include/haproxy/trace.h @@ -0,0 +1,216 @@ +/* + * include/haproxy/trace.h + * This file provides functions for runtime tracing + * + * Copyright (C) 2000-2019 Willy Tarreau - w@1wt.eu + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_TRACE_H +#define _HAPROXY_TRACE_H + +#include <import/ist.h> +#include <haproxy/api.h> +#include <haproxy/list.h> +#include <haproxy/sink-t.h> +#include <haproxy/tools.h> +#include <haproxy/trace-t.h> + +/* Make a string from the location of the trace producer as "file:line" */ +#define TRC_LOC _TRC_LOC(__FILE__, __LINE__) +#define _TRC_LOC(f,l) __TRC_LOC(f, ":", l) +#define __TRC_LOC(f,c,l) f c #l + +/* truncate a macro arg list to exactly 5 args and replace missing ones with NULL. + * The first one (a0) is always ignored. + */ +#define TRC_5ARGS(a0,a1,a2,a3,a4,a5,...) DEFNULL(a1),DEFNULL(a2),DEFNULL(a3),DEFNULL(a4),DEFNULL(a5) + +/* reports whether trace is active for the source and the arguments. It uses + * the same criteria as trace() (locking, filtering etc) so it's safe to use + * from application code to decide whether or not to engage in heavier data + * preparation processing. + */ +#define _trace_enabled(level, mask, src, args...) \ + (unlikely((src)->state != TRACE_STATE_STOPPED && \ + __trace_enabled(level, mask, src, ##args, NULL) > 0)) + +/* sends a trace for the given source. Arguments are passed in the exact same + * order as in the __trace() function, which is only called if (src)->state is + * not TRACE_STATE_STOPPED. This is the only case where arguments are evaluated. + */ +#define _trace(level, mask, src, args...) \ + do { \ + if (unlikely((src)->state != TRACE_STATE_STOPPED)) \ + __trace(level, mask, src, ##args); \ + } while (0) + +/* For convenience, TRACE() alone uses the file's default TRACE_LEVEL, most + * likely TRACE_LEVEL_DEVELOPER, though the other explicit variants specify + * the desired level and will work when TRACE_LEVEL is not set. The 5 optional + * arguments are the 4 source-specific arguments that are passed to the cb() + * callback dedicated to decoding, and which may be used for special tracking. + * These 4 arguments as well as the cb() function pointer may all be NULL, or + * simply omitted (in which case they will be replaced by a NULL). 
This
+ * ordering allows many TRACE() calls to be placed using copy-paste, with just
+ * the message changed at the beginning. Only TRACE_DEVEL(), TRACE_ENTER() and
+ * TRACE_LEAVE() will report the calling function's name. TRACE_PRINTF() does
+ * require all the optional a1..a4 to be passed (possibly zero) so that they're
+ * always followed by the format string, then the values to be formatted.
+ *
+ * TRACE_* will call the _trace() macro which will test if the trace is enabled
+ * before calling the __trace() function. _trace() shouldn't be a function (nor
+ * inline) itself because we don't want the caller to compute its arguments if
+ * traces are not enabled.
+ *
+ * TRACE_ENABLED() reports whether or not trace is enabled for the current
+ * source, level, mask and arguments.
+ */
+#define TRACE_ENABLED(level, mask, args...) (_trace_enabled((level), (mask), TRACE_SOURCE, ist(TRC_LOC), __FUNCTION__, ##args))
+
+#define TRACE(msg, mask, args...) \
+ _trace(TRACE_LEVEL, (mask), TRACE_SOURCE, ist(TRC_LOC), NULL, TRC_5ARGS(0,##args,0,0,0,0,0), ist(msg))
+
+#define TRACE_ERROR(msg, mask, args...) \
+ _trace(TRACE_LEVEL_ERROR, (mask), TRACE_SOURCE, ist(TRC_LOC), NULL, TRC_5ARGS(0,##args,0,0,0,0,0), ist(msg))
+
+#define TRACE_USER(msg, mask, args...) \
+ _trace(TRACE_LEVEL_USER, (mask), TRACE_SOURCE, ist(TRC_LOC), NULL, TRC_5ARGS(0,##args,0,0,0,0,0), ist(msg))
+
+#define TRACE_DATA(msg, mask, args...) \
+ _trace(TRACE_LEVEL_DATA, (mask), TRACE_SOURCE, ist(TRC_LOC), NULL, TRC_5ARGS(0,##args,0,0,0,0,0), ist(msg))
+
+#define TRACE_PROTO(msg, mask, args...) \
+ _trace(TRACE_LEVEL_PROTO, (mask), TRACE_SOURCE, ist(TRC_LOC), NULL, TRC_5ARGS(0,##args,0,0,0,0,0), ist(msg))
+
+#define TRACE_STATE(msg, mask, args...) \
+ _trace(TRACE_LEVEL_STATE, (mask), TRACE_SOURCE, ist(TRC_LOC), NULL, TRC_5ARGS(0,##args,0,0,0,0,0), ist(msg))
+
+#define TRACE_DEVEL(msg, mask, args...) \
+ _trace(TRACE_LEVEL_DEVELOPER, (mask), TRACE_SOURCE, ist(TRC_LOC), __FUNCTION__, TRC_5ARGS(0,##args,0,0,0,0,0), ist(msg))
+
+#define TRACE_ENTER(mask, args...) \
+ _trace(TRACE_LEVEL_DEVELOPER, (mask), TRACE_SOURCE, ist(TRC_LOC), __FUNCTION__, TRC_5ARGS(0,##args,0,0,0,0,0), ist("entering"))
+
+#define TRACE_LEAVE(mask, args...) \
+ _trace(TRACE_LEVEL_DEVELOPER, (mask), TRACE_SOURCE, ist(TRC_LOC), __FUNCTION__, TRC_5ARGS(0,##args,0,0,0,0,0), ist("leaving"))
+
+#define TRACE_POINT(mask, args...) \
+ _trace(TRACE_LEVEL_DEVELOPER, (mask), TRACE_SOURCE, ist(TRC_LOC), __FUNCTION__, TRC_5ARGS(0,##args,0,0,0,0,0), ist("in"))
+
+/* This produces a printf-like trace at level <level> for event mask <mask> and
+ * trace arguments <a1..a4>. All args are mandatory, but may be zero. No output
+ * callback will be used since we expect the caller to pass a fully formatted
+ * message that must not be degraded. The output will be truncated to
+ * TRACE_MAX_MSG-1 bytes (1023 by default). Caller must include <stdio.h> for
+ * snprintf(). One call will lead to one independent message, which means that
+ * multiple messages may be interleaved between threads, hence the caller is
+ * encouraged to prepend a context at the beginning of the format string when
+ * dumping lists or arrays. The _LOC variation takes the caller's location and
+ * function name as an ist and a (const char *) respectively; it is meant to be
+ * called from a wrapper function which works on behalf of its caller.
+ */
+#define TRACE_PRINTF(level, mask, a1, a2, a3, a4, fmt, args...)
\ + TRACE_PRINTF_LOC(level, mask, ist(TRC_LOC), __FUNCTION__, a1, a2, a3, a4, fmt, ##args) + +#define TRACE_PRINTF_LOC(level, mask, trc_loc, func, a1, a2, a3, a4, fmt, args...) \ + do { \ + if (TRACE_ENABLED((level), (mask), a1, a2, a3, a4)) { \ + char _msg[TRACE_MAX_MSG]; \ + size_t _msg_len; \ + _msg_len = snprintf(_msg, sizeof(_msg), (fmt), ##args); \ + if (_msg_len >= sizeof(_msg)) \ + _msg_len = sizeof(_msg) - 1; \ + _trace((level), (mask), TRACE_SOURCE, \ + trc_loc, func, a1, a2, a3, a4, \ + &trace_no_cb, ist2(_msg, _msg_len)); \ + } \ + } while (0) + +#if defined(DEBUG_DEV) || defined(DEBUG_FULL) +# define DBG_TRACE(msg, mask, args...) TRACE(msg, mask, ##args) +# define DBG_TRACE_ERROR(msg, mask, args...) TRACE_ERROR(msg, mask, ##args) +# define DBG_TRACE_USER(msg, mask, args...) TRACE_USER(msg, mask, ##args) +# define DBG_TRACE_DATA(msg, mask, args...) TRACE_DATA(msg, mask, ##args) +# define DBG_TRACE_PROTO(msg, mask, args...) TRACE_PROTO(msg, mask, ##args) +# define DBG_TRACE_STATE(msg, mask, args...) TRACE_STATE(msg, mask, ##args) +# define DBG_TRACE_DEVEL(msg, mask, args...) TRACE_DEVEL(msg, mask, ##args) +# define DBG_TRACE_ENTER(mask, args...) TRACE_ENTER(mask, ##args) +# define DBG_TRACE_LEAVE(mask, args...) TRACE_LEAVE(mask, ##args) +# define DBG_TRACE_POINT(mask, args...) TRACE_POINT(mask, ##args) +# define DBG_TRACE_PRINTF(level, args...) TRACE_PRINTF(level, ##args) +# define DBG_TRACE_PRINTF_LOC(level, args...) TRACE_PRINTF_LOC(level, ##args) +#else +# define DBG_TRACE(msg, mask, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_ERROR(msg, mask, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_USER(msg, mask, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_DATA(msg, mask, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_PROTO(msg, mask, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_STATE(msg, mask, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_DEVEL(msg, mask, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_ENTER(mask, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_LEAVE(mask, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_POINT(mask, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_PRINTF(level, args...) do { /* do nothing */ } while(0) +# define DBG_TRACE_PRINTF_LOC(level, args...) 
do { /* do nothing */ } while(0) +#endif + +extern struct list trace_sources; +extern THREAD_LOCAL struct buffer trace_buf; + +int __trace_enabled(enum trace_level level, uint64_t mask, struct trace_source *src, + const struct ist where, const char *func, + const void *a1, const void *a2, const void *a3, const void *a4, + const void **plockptr); + +void __trace(enum trace_level level, uint64_t mask, struct trace_source *src, + const struct ist where, const char *func, + const void *a1, const void *a2, const void *a3, const void *a4, + void (*cb)(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4), + const struct ist msg); + +void trace_no_cb(enum trace_level level, uint64_t mask, const struct trace_source *src, + const struct ist where, const struct ist func, + const void *a1, const void *a2, const void *a3, const void *a4); + +void trace_register_source(struct trace_source *source); + +int trace_parse_cmd(char *arg, char **errmsg); + +/* return a single char to describe a trace state */ +static inline char trace_state_char(enum trace_state st) +{ + return (st == TRACE_STATE_RUNNING) ? 'R' : + (st == TRACE_STATE_WAITING) ? 'w' : + '.'; +} + +/* return a single char to describe an event state */ +static inline char trace_event_char(uint64_t conf, uint64_t ev) +{ + return (conf & ev) ? '+' : '-'; +} + +#endif /* _HAPROXY_TRACE_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/uri_auth-t.h b/include/haproxy/uri_auth-t.h new file mode 100644 index 0000000..009adfd --- /dev/null +++ b/include/haproxy/uri_auth-t.h @@ -0,0 +1,56 @@ +/* + * include/haproxy/uri_auth-t.h + * Definitions for URI-based user authentication using the HTTP basic method. + * + * Copyright 2006-2020 Willy Tarreau <w@1wt.eu> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#ifndef _HAPROXY_URI_AUTH_T_H +#define _HAPROXY_URI_AUTH_T_H + +#include <haproxy/acl-t.h> +#include <haproxy/auth-t.h> + +/* This is a list of proxies we are allowed to see. Later, it should go in the + * user list, but before this we need to support de/re-authentication. 
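+ *
+ * As an illustration (hypothetical helper, not part of this API), checking
+ * whether a proxy named <name> of length <len> is visible could walk this
+ * list as follows:
+ *
+ *     const struct stat_scope *s;
+ *     for (s = uri->scope; s; s = s->next)
+ *         if (s->px_len == len && memcmp(s->px_id, name, len) == 0)
+ *             return 1;  // proxy is visible in this scope
+ *     return 0;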
+ */
+struct stat_scope {
+ struct stat_scope *next; /* next entry, NULL if none */
+ int px_len; /* proxy name length */
+ char *px_id; /* proxy id */
+};
+
+/* later we may link them to support multiple URI matching */
+struct uri_auth {
+ int uri_len; /* the prefix length */
+ char *uri_prefix; /* the prefix we want to match */
+ char *auth_realm; /* the realm reported to the client */
+ char *node, *desc; /* node name & description reported in these stats */
+ int refresh; /* refresh interval for the browser (in seconds) */
+ unsigned int flags; /* STAT_* flags from stats.h and for applet.ctx.stats.flags */
+ struct stat_scope *scope; /* linked list of authorized proxies */
+ struct userlist *userlist; /* private userlist to emulate legacy "stats auth user:password" */
+ struct list http_req_rules; /* stats http-request rules : allow/deny/auth */
+ struct list admin_rules; /* 'stats admin' rules (chained) */
+ struct uri_auth *next; /* Used at deinit() to build a list of unique elements */
+};
+
+struct stats_admin_rule {
+ struct list list; /* list linked to from the proxy */
+ struct acl_cond *cond; /* acl condition to meet */
+};
+
+#endif /* _HAPROXY_URI_AUTH_T_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/uri_auth.h b/include/haproxy/uri_auth.h
new file mode 100644
index 0000000..27dca02
--- /dev/null
+++ b/include/haproxy/uri_auth.h
@@ -0,0 +1,44 @@
+/*
+ * include/haproxy/uri_auth.h
+ * Functions for URI-based user authentication using the HTTP basic method.
+ *
+ * Copyright 2006-2020 Willy Tarreau <w@1wt.eu>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef _HAPROXY_URI_AUTH_H
+#define _HAPROXY_URI_AUTH_H
+
+#include <haproxy/api.h>
+#include <haproxy/uri_auth-t.h>
+
+/* Various functions used to set the fields during the configuration parsing.
+ * Please note that all these functions can initialize the root entry so as not
+ * to force the user to respect a certain order in the configuration file.
+ *
+ * Default values are used during initialization. Check STATS_DEFAULT_* for
+ * more information.
+ */
+struct uri_auth *stats_check_init_uri_auth(struct uri_auth **root);
+struct uri_auth *stats_set_uri(struct uri_auth **root, char *uri);
+struct uri_auth *stats_set_realm(struct uri_auth **root, char *realm);
+struct uri_auth *stats_set_refresh(struct uri_auth **root, int interval);
+struct uri_auth *stats_set_flag(struct uri_auth **root, int flag);
+struct uri_auth *stats_add_auth(struct uri_auth **root, char *user);
+struct uri_auth *stats_add_scope(struct uri_auth **root, char *scope);
+struct uri_auth *stats_set_node(struct uri_auth **root, char *name);
+struct uri_auth *stats_set_desc(struct uri_auth **root, char *desc);
+
+#endif /* _HAPROXY_URI_AUTH_H */
+
+/*
+ * Local variables:
+ * c-indent-level: 8
+ * c-basic-offset: 8
+ * End:
+ */
diff --git a/include/haproxy/uri_normalizer-t.h b/include/haproxy/uri_normalizer-t.h
new file mode 100644
index 0000000..bcbcaef
--- /dev/null
+++ b/include/haproxy/uri_normalizer-t.h
@@ -0,0 +1,31 @@
+/*
+ * include/haproxy/uri_normalizer-t.h
+ * HTTP request URI normalization.
+ * + * Copyright 2021 Tim Duesterhus <tim@bastelstu.be> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#ifndef _HAPROXY_URI_NORMALIZER_T_H +#define _HAPROXY_URI_NORMALIZER_T_H + +enum uri_normalizer_err { + URI_NORMALIZER_ERR_NONE = 0, + URI_NORMALIZER_ERR_ALLOC, + URI_NORMALIZER_ERR_INVALID_INPUT, + URI_NORMALIZER_ERR_INTERNAL_ERROR = 0xdead, +}; + +#endif /* _HAPROXY_URI_NORMALIZER_T_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/uri_normalizer.h b/include/haproxy/uri_normalizer.h new file mode 100644 index 0000000..b384007 --- /dev/null +++ b/include/haproxy/uri_normalizer.h @@ -0,0 +1,44 @@ +/* + * include/haproxy/uri_normalizer.h + * HTTP request URI normalization. + * + * Copyright 2021 Tim Duesterhus <tim@bastelstu.be> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#ifndef _HAPROXY_URI_NORMALIZER_H +#define _HAPROXY_URI_NORMALIZER_H + +#include <import/ist.h> + +#include <haproxy/uri_normalizer-t.h> + +/* Cuts the input at the first '#'. */ +static inline enum uri_normalizer_err uri_normalizer_fragment_strip(const struct ist input, struct ist *dst) +{ + *dst = iststop(input, '#'); + + return URI_NORMALIZER_ERR_NONE; +} + +enum uri_normalizer_err uri_normalizer_fragment_encode(const struct ist input, struct ist *dst); +enum uri_normalizer_err uri_normalizer_percent_decode_unreserved(const struct ist input, int strict, struct ist *dst); +enum uri_normalizer_err uri_normalizer_percent_upper(const struct ist input, int strict, struct ist *dst); +enum uri_normalizer_err uri_normalizer_path_dot(const struct ist path, struct ist *dst); +enum uri_normalizer_err uri_normalizer_path_dotdot(const struct ist path, int full, struct ist *dst); +enum uri_normalizer_err uri_normalizer_path_merge_slashes(const struct ist path, struct ist *dst); +enum uri_normalizer_err uri_normalizer_query_sort(const struct ist query, const char delim, struct ist *dst); + +#endif /* _HAPROXY_URI_NORMALIZER_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/haproxy/vars-t.h b/include/haproxy/vars-t.h new file mode 100644 index 0000000..e239b1c --- /dev/null +++ b/include/haproxy/vars-t.h @@ -0,0 +1,71 @@ +/* + * include/haproxy/vars-t.h + * Macros and structures definitions for variables. + * + * Copyright (C) 2015 Thierry FOURNIER <tfournier@arpalert.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _HAPROXY_VARS_T_H +#define _HAPROXY_VARS_T_H + +#include <haproxy/sample_data-t.h> +#include <haproxy/thread-t.h> + +/* flags used when setting/clearing variables */ +#define VF_CREATEONLY 0x00000001 // do nothing if the variable already exists +#define VF_PERMANENT 0x00000002 // variables known to the config parser + +#define VF_COND_IFEXISTS 0x00000004 // only set variable if it already exists +#define VF_COND_IFNOTEXISTS 0x00000008 // only set variable if it did not exist yet +#define VF_COND_IFEMPTY 0x00000010 // only set variable if sample is empty +#define VF_COND_IFNOTEMPTY 0x00000020 // only set variable if sample is not empty +#define VF_COND_IFSET 0x00000040 // only set variable if its type is not SMP_TYPE_ANY +#define VF_COND_IFNOTSET 0x00000080 // only set variable if its type is ANY +#define VF_COND_IFGT 0x00000100 // only set variable if its value is greater than the sample's +#define VF_COND_IFLT 0x00000200 // only set variable if its value is less than the sample's + +enum vars_scope { + SCOPE_SESS = 0, + SCOPE_TXN, + SCOPE_REQ, + SCOPE_RES, + SCOPE_PROC, + SCOPE_CHECK, +}; + +struct vars { + struct list head; + enum vars_scope scope; + unsigned int size; + __decl_thread(HA_RWLOCK_T rwlock); +}; + +/* This struct describes a variable as found in an arg_data */ +struct var_desc { + uint64_t name_hash; + enum vars_scope scope; +}; + +struct var { + struct list l; /* Used for chaining vars. */ + uint64_t name_hash; /* XXH3() of the variable's name */ + uint flags; // VF_* + /* 32-bit hole here */ + struct sample_data data; /* data storage. */ +}; + +#endif diff --git a/include/haproxy/vars.h b/include/haproxy/vars.h new file mode 100644 index 0000000..ebd1f15 --- /dev/null +++ b/include/haproxy/vars.h @@ -0,0 +1,72 @@ +/* + * include/haproxy/vars.h + * Prototypes for variables. + * + * Copyright (C) 2015 Thierry FOURNIER <tfournier@arpalert.org> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_VARS_H
+#define _HAPROXY_VARS_H
+
+#include <haproxy/api-t.h>
+#include <haproxy/session-t.h>
+#include <haproxy/stream-t.h>
+#include <haproxy/vars-t.h>
+
+extern struct vars proc_vars;
+
+void vars_init_head(struct vars *vars, enum vars_scope scope);
+void var_accounting_diff(struct vars *vars, struct session *sess, struct stream *strm, int size);
+unsigned int var_clear(struct var *var, int force);
+void vars_prune(struct vars *vars, struct session *sess, struct stream *strm);
+void vars_prune_per_sess(struct vars *vars);
+int vars_get_by_name(const char *name, size_t len, struct sample *smp, const struct buffer *def);
+int vars_set_by_name_ifexist(const char *name, size_t len, struct sample *smp);
+int vars_set_by_name(const char *name, size_t len, struct sample *smp);
+int vars_unset_by_name_ifexist(const char *name, size_t len, struct sample *smp);
+int vars_get_by_desc(const struct var_desc *var_desc, struct sample *smp, const struct buffer *def);
+int vars_check_arg(struct arg *arg, char **err);
+
+/* locks the <vars> for writes if it's in a shared scope */
+static inline void vars_wrlock(struct vars *vars)
+{
+ if (vars->scope == SCOPE_PROC)
+ HA_RWLOCK_WRLOCK(VARS_LOCK, &vars->rwlock);
+}
+
+/* unlocks the <vars> for writes if it's in a shared scope */
+static inline void vars_wrunlock(struct vars *vars)
+{
+ if (vars->scope == SCOPE_PROC)
+ HA_RWLOCK_WRUNLOCK(VARS_LOCK, &vars->rwlock);
+}
+
+/* locks the <vars> for reads if it's in a shared scope */
+static inline void vars_rdlock(struct vars *vars)
+{
+ if (vars->scope == SCOPE_PROC)
+ HA_RWLOCK_RDLOCK(VARS_LOCK, &vars->rwlock);
+}
+
+/* unlocks the <vars> for reads if it's in a shared scope */
+static inline void vars_rdunlock(struct vars *vars)
+{
+ if (vars->scope == SCOPE_PROC)
+ HA_RWLOCK_RDUNLOCK(VARS_LOCK, &vars->rwlock);
+}
+
+#endif
diff --git a/include/haproxy/version.h b/include/haproxy/version.h
new file mode 100644
index 0000000..651a8de
--- /dev/null
+++ b/include/haproxy/version.h
@@ -0,0 +1,86 @@
+/*
+ * include/haproxy/version.h
+ * This file provides the product name and version definitions.
+ *
+ * Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _HAPROXY_VERSION_H
+#define _HAPROXY_VERSION_H
+
+#include <haproxy/api.h>
+
+#ifdef CONFIG_PRODUCT_NAME
+#define PRODUCT_NAME CONFIG_PRODUCT_NAME
+#else
+#define PRODUCT_NAME "HAProxy"
+#endif
+
+#ifdef CONFIG_PRODUCT_BRANCH
+#define PRODUCT_BRANCH CONFIG_PRODUCT_BRANCH
+#else
+#define PRODUCT_BRANCH "2.9"
+#endif
+
+#ifdef CONFIG_PRODUCT_STATUS
+#define PRODUCT_STATUS CONFIG_PRODUCT_STATUS
+#else
+#define PRODUCT_STATUS "Status: stable branch - will stop receiving fixes around Q1 2025."
+#endif
+
+#ifdef CONFIG_PRODUCT_URL_BUGS
+#define PRODUCT_URL_BUGS CONFIG_PRODUCT_URL_BUGS
+#else
+#define PRODUCT_URL_BUGS "http://www.haproxy.org/bugs/bugs-%s.html"
+#endif
+
+#ifdef CONFIG_PRODUCT_URL
+#define PRODUCT_URL CONFIG_PRODUCT_URL
+#else
+#define PRODUCT_URL "http://www.haproxy.org/"
+#endif
+
+#ifdef CONFIG_PRODUCT_URL_UPD
+#define PRODUCT_URL_UPD CONFIG_PRODUCT_URL_UPD
+#else
+#define PRODUCT_URL_UPD "http://www.haproxy.org/#down"
+#endif
+
+#ifdef CONFIG_PRODUCT_URL_DOC
+#define PRODUCT_URL_DOC CONFIG_PRODUCT_URL_DOC
+#else
+#define PRODUCT_URL_DOC "http://www.haproxy.org/#docs"
+#endif
+
+#ifdef CONFIG_HAPROXY_VERSION
+#define HAPROXY_VERSION CONFIG_HAPROXY_VERSION
+#else
+#error "Must define CONFIG_HAPROXY_VERSION"
+#endif
+
+#ifdef CONFIG_HAPROXY_DATE
+#define HAPROXY_DATE CONFIG_HAPROXY_DATE
+#else
+#error "Must define CONFIG_HAPROXY_DATE"
+#endif
+
+extern char haproxy_version[];
+extern char haproxy_date[];
+extern char stats_version_string[];
+
+#endif /* _HAPROXY_VERSION_H */
+
diff --git a/include/haproxy/xref-t.h b/include/haproxy/xref-t.h
new file mode 100644
index 0000000..a2aed54
--- /dev/null
+++ b/include/haproxy/xref-t.h
@@ -0,0 +1,45 @@
+/*
+ * include/haproxy/xref-t.h
+ * Atomic cross-references between two elements - types
+ *
+ * Copyright (C) 2017 Thierry Fournier <thierry.fournier@ozon.io>
+ * Copyright (C) 2020 Willy Tarreau - w@1wt.eu
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef __HAPROXY_XREF_T_H__
+#define __HAPROXY_XREF_T_H__
+
+/* xref is used to create a relation between two elements.
+ * Once an element is released, it breaks the relation. If the
+ * relation is already broken, it frees the xref struct.
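+ *
+ * A minimal usage sketch (illustrative only; the functions come from xref.h):
+ *
+ *     struct xref a, b;
+ *     xref_create(&a, &b);                  // link both sides together
+ *     struct xref *peer = xref_get_peer_and_lock(&a);
+ *     if (peer) {
+ *         // both sides are locked here, the peer cannot vanish
+ *         xref_disconnect(&a, peer);        // or xref_unlock(&a, peer) to keep it
+ *     }
+ *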
+ * The pointer between two elements is sort of a refcount with
+ * max value 1. The relation is only between two elements.
+ * The pointer and the type of elements a and b are conventional.
+ */
+
+#define XREF_BUSY ((struct xref *)1)
+
+struct xref {
+ struct xref *peer;
+};
+
+#endif /* __HAPROXY_XREF_T_H__ */
diff --git a/include/haproxy/xref.h b/include/haproxy/xref.h
new file mode 100644
index 0000000..42eed58
--- /dev/null
+++ b/include/haproxy/xref.h
@@ -0,0 +1,105 @@
+/*
+ * include/haproxy/xref.h
+ * Atomic cross-references between two elements - functions
+ *
+ * Copyright (C) 2017 Thierry Fournier <thierry.fournier@ozon.io>
+ * Copyright (C) 2020 Willy Tarreau - w@1wt.eu
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef __HAPROXY_XREF_H__
+#define __HAPROXY_XREF_H__
+
+#include <haproxy/xref-t.h>
+
+/* xref is used to create a relation between two elements.
+ * Once an element is released, it breaks the relation. If the
+ * relation is already broken, it frees the xref struct.
+ * The pointer between two elements is sort of a refcount with
+ * max value 1. The relation is only between two elements.
+ * The pointer and the type of elements a and b are conventional.
+ */
+
+static inline void xref_create(struct xref *xref_a, struct xref *xref_b)
+{
+ xref_a->peer = xref_b;
+ xref_b->peer = xref_a;
+}
+
+static inline struct xref *xref_get_peer_and_lock(struct xref *xref)
+{
+ struct xref *local;
+ struct xref *remote;
+
+ while (1) {
+
+ /* Get the local pointer to the peer. */
+ local = _HA_ATOMIC_XCHG(&xref->peer, XREF_BUSY);
+ __ha_barrier_atomic_store();
+
+ /* If the local pointer is NULL, the peer no longer exists. */
+ if (local == NULL) {
+ xref->peer = NULL;
+ return NULL;
+ }
+
+ /* If the local pointer is BUSY, the peer is trying to acquire
+ * the lock. We retry the process.
+ */
+ if (local == XREF_BUSY)
+ continue;
+
+ /* We are locked, the peer can't disappear, try to acquire
+ * the peer's lock. Note that remote can't be NULL.
+ */
+ remote = _HA_ATOMIC_XCHG(&local->peer, XREF_BUSY);
+
+ /* The remote lock is BUSY, we retry the process. */
+ if (remote == XREF_BUSY) {
+ xref->peer = local;
+ __ha_barrier_store();
+ continue;
+ }
+
+ /* We have the lock, we return the value of the xref. */
+ return local;
+ }
+}
+
+static inline void xref_unlock(struct xref *xref, struct xref *peer)
+{
+ /* Release the peer. */
+ peer->peer = xref;
+
+ __ha_barrier_store();
+
+ /* Release myself.
*/ + xref->peer = peer; +} + +static inline void xref_disconnect(struct xref *xref, struct xref *peer) +{ + peer->peer = NULL; + __ha_barrier_store(); + xref->peer = NULL; +} + +#endif /* __HAPROXY_XREF_H__ */ diff --git a/include/haproxy/xxhash.h b/include/haproxy/xxhash.h new file mode 100644 index 0000000..cd333e6 --- /dev/null +++ b/include/haproxy/xxhash.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2020 Dragan Dosen <ddosen@haproxy.com> + * Copyright (C) 2021 Tim Duesterhus <tim@bastelstu.be> + * + * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef _HAPROXY_XXHASH_H +#define _HAPROXY_XXHASH_H + +/* Make all xxhash functions inline, with implementations being directly + * included within xxhash.h. + */ +#ifndef XXH_INLINE_ALL +#define XXH_INLINE_ALL +#else +#error "XXH_INLINE_ALL is already defined." +#endif + +#include <import/xxhash.h> + +/* Make the new complex name for the xxhash function easier to remember + * and use. + */ +#ifndef XXH3 +#define XXH3(data, len, seed) XXH3_64bits_withSeed(data, len, seed) +#endif + +#endif diff --git a/include/import/atomic-ops.h b/include/import/atomic-ops.h new file mode 100644 index 0000000..29674db --- /dev/null +++ b/include/import/atomic-ops.h @@ -0,0 +1,1991 @@ +/* generic atomic operations used by progressive locks + * + * Copyright (C) 2012-2022 Willy Tarreau <w@1wt.eu> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef PL_ATOMIC_OPS_H
+#define PL_ATOMIC_OPS_H
+
+/* The definitions below exist in two forms:
+ * - fallback form (_pl_*)
+ * - preferred form (pl_*)
+ *
+ * As a general rule, given that C11 atomics tend to offer more flexibility to
+ * the compiler, these should set the preferred form, and the arch-specific
+ * code should set the fallback code. But it's possible for arch-specific code
+ * to set a preferred form, in which case it will simply be used over the other
+ * ones.
+ */
+
+/*
+ * Architecture-specific versions of the various operations
+ */
+
+/*
+ * ###### ix86 / x86_64 below ######
+ */
+#if defined(__i386__) || defined (__i486__) || defined (__i586__) || defined (__i686__) || defined (__x86_64__)
+
+/* for compilers supporting condition flags on output, let's directly return them */
+#if defined(__GCC_ASM_FLAG_OUTPUTS__)
+#define X86_COND_C_TO_REG(reg) ""
+#define X86_COND_Z_TO_REG(reg) ""
+#define X86_COND_NZ_TO_REG(reg) ""
+#define X86_COND_C_RESULT(var) "=@ccc"(var)
+#define X86_COND_Z_RESULT(var) "=@ccz"(var)
+#define X86_COND_NZ_RESULT(var) "=@ccnz"(var)
+#else
+#define X86_COND_C_TO_REG(reg) "sbb %" #reg ", %" #reg "\n\t"
+#define X86_COND_Z_TO_REG(reg) "sete %" #reg "\n\t"
+#define X86_COND_NZ_TO_REG(reg) "setne %" #reg "\n\t"
+#define X86_COND_C_RESULT(var) "=r"(var)
+#define X86_COND_Z_RESULT(var) "=qm"(var)
+#define X86_COND_NZ_RESULT(var) "=qm"(var)
+#endif
+
+/* CPU relaxation while waiting (PAUSE instruction on x86) */
+#define pl_cpu_relax() do { \
+ asm volatile("rep;nop\n"); \
+ } while (0)
+
+/* full memory barrier using mfence when SSE2 is supported, falling back to
+ * "lock add %esp" (gcc uses "lock add" or "lock or").
+ */
+#if defined(__SSE2__)
+
+#define _pl_mb() do { \
+ asm volatile("mfence" ::: "memory"); \
+ } while (0)
+
+#elif defined(__x86_64__)
+
+#define _pl_mb() do { \
+ asm volatile("lock addl $0,0 (%%rsp)" ::: "memory", "cc"); \
+ } while (0)
+
+#else /* ix86 */
+
+#define _pl_mb() do { \
+ asm volatile("lock addl $0,0 (%%esp)" ::: "memory", "cc"); \
+ } while (0)
+
+#endif /* end of pl_mb() case for sse2/x86_64/x86 */
+
+/* load/store barriers are nops on x86 */
+#define _pl_mb_load() do { asm volatile("" ::: "memory"); } while (0)
+#define _pl_mb_store() do { asm volatile("" ::: "memory"); } while (0)
+
+/* atomic full/load/store are also nops on x86 */
+#define _pl_mb_ato() do { asm volatile("" ::: "memory"); } while (0)
+#define _pl_mb_ato_load() do { asm volatile("" ::: "memory"); } while (0)
+#define _pl_mb_ato_store() do { asm volatile("" ::: "memory"); } while (0)
+
+/* atomic load: on x86 it's just a volatile read */
+#define _pl_load_lax(ptr) _pl_load(ptr)
+#define _pl_load(ptr) ({ typeof(*(ptr)) __ptr = *(volatile typeof(ptr))ptr; __ptr; })
+
+/* atomic store: on x86 it's just a volatile write */
+#define _pl_store_lax(ptr, x) _pl_store(ptr, x)
+#define _pl_store(ptr, x) do { *((volatile typeof(ptr))(ptr)) = (typeof(*ptr))(x); } while (0)
+
+/* increment integer value pointed to by pointer <ptr>, and return non-zero if
+ * result is non-null.
+ */
+#define _pl_inc_lax(ptr) _pl_inc(ptr)
+#define _pl_inc_acq(ptr) _pl_inc(ptr)
+#define _pl_inc_rel(ptr) _pl_inc(ptr)
+#define _pl_inc(ptr) ( \
+ (sizeof(long) == 8 && sizeof(*(ptr)) == 8) ?
({ \ + unsigned char ret; \ + asm volatile("lock incq %0\n" \ + X86_COND_NZ_TO_REG(1) \ + : "+m" (*(ptr)), X86_COND_NZ_RESULT(ret) \ + : \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 4) ? ({ \ + unsigned char ret; \ + asm volatile("lock incl %0\n" \ + X86_COND_NZ_TO_REG(1) \ + : "+m" (*(ptr)), X86_COND_NZ_RESULT(ret) \ + : \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 2) ? ({ \ + unsigned char ret; \ + asm volatile("lock incw %0\n" \ + X86_COND_NZ_TO_REG(1) \ + : "+m" (*(ptr)), X86_COND_NZ_RESULT(ret) \ + : \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 1) ? ({ \ + unsigned char ret; \ + asm volatile("lock incb %0\n" \ + X86_COND_NZ_TO_REG(1) \ + : "+m" (*(ptr)), X86_COND_NZ_RESULT(ret) \ + : \ + : "cc"); \ + ret; /* return value */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_inc__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_inc__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* decrement integer value pointed to by pointer <ptr>, and return non-zero if + * result is non-null. + */ +#define _pl_dec_lax(ptr) _pl_dec(ptr) +#define _pl_dec_acq(ptr) _pl_dec(ptr) +#define _pl_dec_rel(ptr) _pl_dec(ptr) +#define _pl_dec(ptr) ( \ + (sizeof(long) == 8 && sizeof(*(ptr)) == 8) ? ({ \ + unsigned char ret; \ + asm volatile("lock decq %0\n" \ + X86_COND_NZ_TO_REG(1) \ + : "+m" (*(ptr)), X86_COND_NZ_RESULT(ret) \ + : \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 4) ? ({ \ + unsigned char ret; \ + asm volatile("lock decl %0\n" \ + X86_COND_NZ_TO_REG(1) \ + : "+m" (*(ptr)), X86_COND_NZ_RESULT(ret) \ + : \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 2) ? ({ \ + unsigned char ret; \ + asm volatile("lock decw %0\n" \ + X86_COND_NZ_TO_REG(1) \ + : "+m" (*(ptr)), X86_COND_NZ_RESULT(ret) \ + : \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 1) ? 
({ \ + unsigned char ret; \ + asm volatile("lock decb %0\n" \ + X86_COND_NZ_TO_REG(1) \ + : "+m" (*(ptr)), X86_COND_NZ_RESULT(ret) \ + : \ + : "cc"); \ + ret; /* return value */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_dec__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_dec__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* increment integer value pointed to by pointer <ptr>, no return */ +#define pl_inc_noret_lax(ptr) pl_inc_noret(ptr) +#define pl_inc_noret_acq(ptr) pl_inc_noret(ptr) +#define pl_inc_noret_rel(ptr) pl_inc_noret(ptr) +#define pl_inc_noret(ptr) do { \ + if (sizeof(long) == 8 && sizeof(*(ptr)) == 8) { \ + asm volatile("lock incq %0\n" \ + : "+m" (*(ptr)) \ + : \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 4) { \ + asm volatile("lock incl %0\n" \ + : "+m" (*(ptr)) \ + : \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 2) { \ + asm volatile("lock incw %0\n" \ + : "+m" (*(ptr)) \ + : \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 1) { \ + asm volatile("lock incb %0\n" \ + : "+m" (*(ptr)) \ + : \ + : "cc"); \ + } else { \ + void __unsupported_argument_size_for_pl_inc_noret__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_inc_noret__(__FILE__,__LINE__); \ + } \ +} while (0) + +/* decrement integer value pointed to by pointer <ptr>, no return */ +#define pl_dec_noret_lax(ptr) pl_dec_noret(ptr) +#define pl_dec_noret_acq(ptr) pl_dec_noret(ptr) +#define pl_dec_noret_rel(ptr) pl_dec_noret(ptr) +#define pl_dec_noret(ptr) do { \ + if (sizeof(long) == 8 && sizeof(*(ptr)) == 8) { \ + asm volatile("lock decq %0\n" \ + : "+m" (*(ptr)) \ + : \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 4) { \ + asm volatile("lock decl %0\n" \ + : "+m" (*(ptr)) \ + : \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 2) { \ + asm volatile("lock decw %0\n" \ + : "+m" (*(ptr)) \ + : \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 1) { \ + asm volatile("lock decb %0\n" \ + : "+m" (*(ptr)) \ + : \ + : "cc"); \ + } else { \ + void __unsupported_argument_size_for_pl_dec_noret__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_dec_noret__(__FILE__,__LINE__); \ + } \ +} while (0) + +/* add integer constant <x> to integer value pointed to by pointer <ptr>, + * no return. Size of <x> is not checked. 
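+ *
+ * Illustrative use (total_bytes and len are assumed names): updating a shared
+ * counter where the result is not needed:
+ *
+ *     static unsigned long total_bytes;
+ *     _pl_add_noret(&total_bytes, len);   // single lock-prefixed add, no result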
+ */ +#define _pl_add_noret_lax(ptr, x) _pl_add_noret(ptr, x) +#define _pl_add_noret_acq(ptr, x) _pl_add_noret(ptr, x) +#define _pl_add_noret_rel(ptr, x) _pl_add_noret(ptr, x) +#define _pl_add_noret(ptr, x) do { \ + if (sizeof(long) == 8 && sizeof(*(ptr)) == 8) { \ + asm volatile("lock addq %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned long)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 4) { \ + asm volatile("lock addl %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned int)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 2) { \ + asm volatile("lock addw %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned short)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 1) { \ + asm volatile("lock addb %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned char)(x)) \ + : "cc"); \ + } else { \ + void __unsupported_argument_size_for_pl_add__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_add__(__FILE__,__LINE__); \ + } \ +} while (0) + +/* subtract integer constant <x> from integer value pointed to by pointer + * <ptr>, no return. Size of <x> is not checked. + */ +#define _pl_sub_noret_lax(ptr, x) _pl_sub_noret(ptr, x) +#define _pl_sub_noret_acq(ptr, x) _pl_sub_noret(ptr, x) +#define _pl_sub_noret_rel(ptr, x) _pl_sub_noret(ptr, x) +#define _pl_sub_noret(ptr, x) do { \ + if (sizeof(long) == 8 && sizeof(*(ptr)) == 8) { \ + asm volatile("lock subq %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned long)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 4) { \ + asm volatile("lock subl %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned int)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 2) { \ + asm volatile("lock subw %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned short)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 1) { \ + asm volatile("lock subb %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned char)(x)) \ + : "cc"); \ + } else { \ + void __unsupported_argument_size_for_pl_sub__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_sub__(__FILE__,__LINE__); \ + } \ +} while (0) + +/* binary and integer value pointed to by pointer <ptr> with constant <x>, no + * return. Size of <x> is not checked. 
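+ *
+ * Illustrative use (FLAG_A/FLAG_B are hypothetical bit masks): atomically
+ * clearing flag bits in a shared word without reading the result:
+ *
+ *     _pl_and_noret(&flags, ~(FLAG_A | FLAG_B));  // single lock-prefixed and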
+ */ +#define _pl_and_noret_lax(ptr, x) _pl_and_noret(ptr, x) +#define _pl_and_noret_acq(ptr, x) _pl_and_noret(ptr, x) +#define _pl_and_noret_rel(ptr, x) _pl_and_noret(ptr, x) +#define _pl_and_noret(ptr, x) do { \ + if (sizeof(long) == 8 && sizeof(*(ptr)) == 8) { \ + asm volatile("lock andq %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned long)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 4) { \ + asm volatile("lock andl %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned int)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 2) { \ + asm volatile("lock andw %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned short)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 1) { \ + asm volatile("lock andb %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned char)(x)) \ + : "cc"); \ + } else { \ + void __unsupported_argument_size_for_pl_and__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_and__(__FILE__,__LINE__); \ + } \ +} while (0) + +/* binary or integer value pointed to by pointer <ptr> with constant <x>, no + * return. Size of <x> is not checked. + */ +#define _pl_or_noret_lax(ptr, x) _pl_or_noret(ptr, x) +#define _pl_or_noret_acq(ptr, x) _pl_or_noret(ptr, x) +#define _pl_or_noret_rel(ptr, x) _pl_or_noret(ptr, x) +#define _pl_or_noret(ptr, x) do { \ + if (sizeof(long) == 8 && sizeof(*(ptr)) == 8) { \ + asm volatile("lock orq %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned long)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 4) { \ + asm volatile("lock orl %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned int)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 2) { \ + asm volatile("lock orw %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned short)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 1) { \ + asm volatile("lock orb %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned char)(x)) \ + : "cc"); \ + } else { \ + void __unsupported_argument_size_for_pl_or__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_or__(__FILE__,__LINE__); \ + } \ +} while (0) + +/* binary xor integer value pointed to by pointer <ptr> with constant <x>, no + * return. Size of <x> is not checked. + */ +#define _pl_xor_noret_lax(ptr, x) _pl_xor_noret(ptr, x) +#define _pl_xor_noret_acq(ptr, x) _pl_xor_noret(ptr, x) +#define _pl_xor_noret_rel(ptr, x) _pl_xor_noret(ptr, x) +#define _pl_xor_noret(ptr, x) do { \ + if (sizeof(long) == 8 && sizeof(*(ptr)) == 8) { \ + asm volatile("lock xorq %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned long)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 4) { \ + asm volatile("lock xorl %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned int)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 2) { \ + asm volatile("lock xorw %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned short)(x)) \ + : "cc"); \ + } else if (sizeof(*(ptr)) == 1) { \ + asm volatile("lock xorb %1, %0\n" \ + : "+m" (*(ptr)) \ + : "er" ((unsigned char)(x)) \ + : "cc"); \ + } else { \ + void __unsupported_argument_size_for_pl_xor__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_xor__(__FILE__,__LINE__); \ + } \ +} while (0) + +/* test and reset bit <bit> in integer value pointed to by pointer <ptr>. 
Returns
+ * 0 if the bit was not set, or ~0 of the same type as *ptr if it was set. Note
+ * that there is no 8-bit equivalent operation.
+ */
+#define pl_btr_lax(ptr, bit) pl_btr(ptr, bit)
+#define pl_btr_acq(ptr, bit) pl_btr(ptr, bit)
+#define pl_btr_rel(ptr, bit) pl_btr(ptr, bit)
+#define pl_btr(ptr, bit) ( \
+ (sizeof(long) == 8 && sizeof(*(ptr)) == 8) ? ({ \
+ unsigned long ret; \
+ asm volatile("lock btrq %2, %0\n\t" \
+ X86_COND_C_TO_REG(1) \
+ : "+m" (*(ptr)), X86_COND_C_RESULT(ret) \
+ : "Ir" ((unsigned long)(bit)) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 4) ? ({ \
+ unsigned int ret; \
+ asm volatile("lock btrl %2, %0\n\t" \
+ X86_COND_C_TO_REG(1) \
+ : "+m" (*(ptr)), X86_COND_C_RESULT(ret) \
+ : "Ir" ((unsigned int)(bit)) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 2) ? ({ \
+ unsigned short ret; \
+ asm volatile("lock btrw %2, %0\n\t" \
+ X86_COND_C_TO_REG(1) \
+ : "+m" (*(ptr)), X86_COND_C_RESULT(ret) \
+ : "Ir" ((unsigned short)(bit)) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : ({ \
+ void __unsupported_argument_size_for_pl_btr__(char *,int); \
+ if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \
+ sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \
+ __unsupported_argument_size_for_pl_btr__(__FILE__,__LINE__); \
+ 0; \
+ }) \
+)
+
+/* test and set bit <bit> in integer value pointed to by pointer <ptr>. Returns
+ * 0 if the bit was not set, or ~0 of the same type as *ptr if it was set. Note
+ * that there is no 8-bit equivalent operation.
+ */
+#define pl_bts_lax(ptr, bit) pl_bts(ptr, bit)
+#define pl_bts_acq(ptr, bit) pl_bts(ptr, bit)
+#define pl_bts_rel(ptr, bit) pl_bts(ptr, bit)
+#define pl_bts(ptr, bit) ( \
+ (sizeof(long) == 8 && sizeof(*(ptr)) == 8) ? ({ \
+ unsigned long ret; \
+ asm volatile("lock btsq %2, %0\n\t" \
+ X86_COND_C_TO_REG(1) \
+ : "+m" (*(ptr)), X86_COND_C_RESULT(ret) \
+ : "Ir" ((unsigned long)(bit)) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 4) ? ({ \
+ unsigned int ret; \
+ asm volatile("lock btsl %2, %0\n\t" \
+ X86_COND_C_TO_REG(1) \
+ : "+m" (*(ptr)), X86_COND_C_RESULT(ret) \
+ : "Ir" ((unsigned int)(bit)) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 2) ? ({ \
+ unsigned short ret; \
+ asm volatile("lock btsw %2, %0\n\t" \
+ X86_COND_C_TO_REG(1) \
+ : "+m" (*(ptr)), X86_COND_C_RESULT(ret) \
+ : "Ir" ((unsigned short)(bit)) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : ({ \
+ void __unsupported_argument_size_for_pl_bts__(char *,int); \
+ if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \
+ sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \
+ __unsupported_argument_size_for_pl_bts__(__FILE__,__LINE__); \
+ 0; \
+ }) \
+)
+
+/* Note: for an unclear reason, gcc's __sync_fetch_and_add() implementation
+ * produces less optimal code than hand-crafted asm, so let's implement here
+ * the operations we need for the most common archs.
+ */
+
+/* fetch-and-add: fetch integer value pointed to by pointer <ptr>, add <x>
+ * to <*ptr> and return the previous value.
+ * => THIS IS LEGACY, USE _pl_ldadd() INSTEAD.
+ */
+#define _pl_xadd(ptr, x) ( \
+ (sizeof(long) == 8 && sizeof(*(ptr)) == 8) ? ({ \
+ unsigned long ret = (unsigned long)(x); \
+ asm volatile("lock xaddq %0, %1\n" \
+ : "=r" (ret), "+m" (*(ptr)) \
+ : "0" (ret) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 4) ?
({ \
+ unsigned int ret = (unsigned int)(x); \
+ asm volatile("lock xaddl %0, %1\n" \
+ : "=r" (ret), "+m" (*(ptr)) \
+ : "0" (ret) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 2) ? ({ \
+ unsigned short ret = (unsigned short)(x); \
+ asm volatile("lock xaddw %0, %1\n" \
+ : "=r" (ret), "+m" (*(ptr)) \
+ : "0" (ret) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 1) ? ({ \
+ unsigned char ret = (unsigned char)(x); \
+ asm volatile("lock xaddb %0, %1\n" \
+ : "=r" (ret), "+m" (*(ptr)) \
+ : "0" (ret) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : ({ \
+ void __unsupported_argument_size_for_pl_xadd__(char *,int); \
+ if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \
+ sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \
+ __unsupported_argument_size_for_pl_xadd__(__FILE__,__LINE__); \
+ 0; \
+ }) \
+)
+
+/* fetch-and-add: fetch integer value pointed to by pointer <ptr>, add <x>
+ * to <*ptr> and return the previous value.
+ */
+#define _pl_ldadd_lax(ptr, x) _pl_ldadd(ptr, x)
+#define _pl_ldadd_acq(ptr, x) _pl_ldadd(ptr, x)
+#define _pl_ldadd_rel(ptr, x) _pl_ldadd(ptr, x)
+#define _pl_ldadd(ptr, x) ( \
+ (sizeof(long) == 8 && sizeof(*(ptr)) == 8) ? ({ \
+ unsigned long ret = (unsigned long)(x); \
+ asm volatile("lock xaddq %0, %1\n" \
+ : "=r" (ret), "+m" (*(ptr)) \
+ : "0" (ret) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 4) ? ({ \
+ unsigned int ret = (unsigned int)(x); \
+ asm volatile("lock xaddl %0, %1\n" \
+ : "=r" (ret), "+m" (*(ptr)) \
+ : "0" (ret) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 2) ? ({ \
+ unsigned short ret = (unsigned short)(x); \
+ asm volatile("lock xaddw %0, %1\n" \
+ : "=r" (ret), "+m" (*(ptr)) \
+ : "0" (ret) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 1) ? ({ \
+ unsigned char ret = (unsigned char)(x); \
+ asm volatile("lock xaddb %0, %1\n" \
+ : "=r" (ret), "+m" (*(ptr)) \
+ : "0" (ret) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : ({ \
+ void __unsupported_argument_size_for_pl_ldadd__(char *,int); \
+ if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \
+ sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \
+ __unsupported_argument_size_for_pl_ldadd__(__FILE__,__LINE__); \
+ 0; \
+ }) \
+)
+
+/* fetch-and-sub: fetch integer value pointed to by pointer <ptr>, add -<x>
+ * to <*ptr> and return the previous value.
+ */
+#define _pl_ldsub_lax(ptr, x) _pl_ldsub(ptr, x)
+#define _pl_ldsub_acq(ptr, x) _pl_ldsub(ptr, x)
+#define _pl_ldsub_rel(ptr, x) _pl_ldsub(ptr, x)
+#define _pl_ldsub(ptr, x) ( \
+ (sizeof(long) == 8 && sizeof(*(ptr)) == 8) ? ({ \
+ unsigned long ret = (unsigned long)(-x); \
+ asm volatile("lock xaddq %0, %1\n" \
+ : "=r" (ret), "+m" (*(ptr)) \
+ : "0" (ret) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 4) ? ({ \
+ unsigned int ret = (unsigned int)(-x); \
+ asm volatile("lock xaddl %0, %1\n" \
+ : "=r" (ret), "+m" (*(ptr)) \
+ : "0" (ret) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 2) ? ({ \
+ unsigned short ret = (unsigned short)(-x); \
+ asm volatile("lock xaddw %0, %1\n" \
+ : "=r" (ret), "+m" (*(ptr)) \
+ : "0" (ret) \
+ : "cc"); \
+ ret; /* return value */ \
+ }) : (sizeof(*(ptr)) == 1) ?
({ \ + unsigned char ret = (unsigned char)(-x); \ + asm volatile("lock xaddb %0, %1\n" \ + : "=r" (ret), "+m" (*(ptr)) \ + : "0" (ret) \ + : "cc"); \ + ret; /* return value */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_ldsub__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_ldsub__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* exchange value <x> with integer value pointed to by pointer <ptr>, and return + * previous <*ptr> value. <x> must be of the same size as <*ptr>. + */ +#define _pl_xchg(ptr, x) ( \ + (sizeof(long) == 8 && sizeof(*(ptr)) == 8) ? ({ \ + unsigned long ret = (unsigned long)(x); \ + asm volatile("xchgq %0, %1\n" \ + : "=r" (ret), "+m" (*(ptr)) \ + : "0" (ret) \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 4) ? ({ \ + unsigned int ret = (unsigned int)(x); \ + asm volatile("xchgl %0, %1\n" \ + : "=r" (ret), "+m" (*(ptr)) \ + : "0" (ret) \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 2) ? ({ \ + unsigned short ret = (unsigned short)(x); \ + asm volatile("xchgw %0, %1\n" \ + : "=r" (ret), "+m" (*(ptr)) \ + : "0" (ret) \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 1) ? ({ \ + unsigned char ret = (unsigned char)(x); \ + asm volatile("xchgb %0, %1\n" \ + : "=r" (ret), "+m" (*(ptr)) \ + : "0" (ret) \ + : "cc"); \ + ret; /* return value */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_xchg__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_xchg__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* compare integer value <*ptr> with <old> and exchange it with <new> if + * it matches, and return <old>. <old> and <new> must be of the same size as + * <*ptr>. + */ +#define _pl_cmpxchg(ptr, old, new) ( \ + (sizeof(long) == 8 && sizeof(*(ptr)) == 8) ? ({ \ + unsigned long ret; \ + asm volatile("lock cmpxchgq %2,%1" \ + : "=a" (ret), "+m" (*(ptr)) \ + : "r" ((unsigned long)(new)), \ + "0" ((unsigned long)(old)) \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 4) ? ({ \ + unsigned int ret; \ + asm volatile("lock cmpxchgl %2,%1" \ + : "=a" (ret), "+m" (*(ptr)) \ + : "r" ((unsigned int)(new)), \ + "0" ((unsigned int)(old)) \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 2) ? ({ \ + unsigned short ret; \ + asm volatile("lock cmpxchgw %2,%1" \ + : "=a" (ret), "+m" (*(ptr)) \ + : "r" ((unsigned short)(new)), \ + "0" ((unsigned short)(old)) \ + : "cc"); \ + ret; /* return value */ \ + }) : (sizeof(*(ptr)) == 1) ? 
({ \ + unsigned char ret; \ + asm volatile("lock cmpxchgb %2,%1" \ + : "=a" (ret), "+m" (*(ptr)) \ + : "r" ((unsigned char)(new)), \ + "0" ((unsigned char)(old)) \ + : "cc"); \ + ret; /* return value */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_cmpxchg__(char *,int); \ + if (sizeof(*(ptr)) != 1 && sizeof(*(ptr)) != 2 && \ + sizeof(*(ptr)) != 4 && (sizeof(long) != 8 || sizeof(*(ptr)) != 8)) \ + __unsupported_argument_size_for_pl_cmpxchg__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* + * ##### ARM64 (aarch64) below ##### + */ +#elif defined(__aarch64__) + +/* This was shown to improve fairness on modern ARMv8 such as Neoverse N1 */ +#define pl_cpu_relax() do { \ + asm volatile("isb" ::: "memory"); \ + } while (0) + +/* full/load/store barriers */ +#define _pl_mb() do { asm volatile("dmb ish" ::: "memory"); } while (0) +#define _pl_mb_load() do { asm volatile("dmb ishld" ::: "memory"); } while (0) +#define _pl_mb_store() do { asm volatile("dmb ishst" ::: "memory"); } while (0) + +/* atomic full/load/store */ +#define _pl_mb_ato() do { asm volatile("dmb ish" ::: "memory"); } while (0) +#define _pl_mb_ato_load() do { asm volatile("dmb ishld" ::: "memory"); } while (0) +#define _pl_mb_ato_store() do { asm volatile("dmb ishst" ::: "memory"); } while (0) + +#endif // end of arch-specific code + + +/* + * Generic code using the C11 __atomic API for functions not defined above. + * These are usable from gcc-4.7 and clang. We'll simply rely on the macros + * defining the memory orders to detect them. All operations are not + * necessarily defined, so some fallbacks to the default methods might still + * be necessary. + */ + + +#if defined(__ATOMIC_RELAXED) && defined(__ATOMIC_CONSUME) && defined(__ATOMIC_ACQUIRE) && \ + defined(__ATOMIC_RELEASE) && defined(__ATOMIC_ACQ_REL) && defined(__ATOMIC_SEQ_CST) + +/* compiler-only memory barrier, for use around locks */ +#ifndef pl_barrier +#define pl_barrier() __atomic_signal_fence(__ATOMIC_SEQ_CST) +#endif + +/* full memory barrier */ +#ifndef pl_mb +#define pl_mb() __atomic_thread_fence(__ATOMIC_SEQ_CST) +#endif + +/* atomic load */ +#ifndef pl_load_lax +#define pl_load_lax(ptr) __atomic_load_n(ptr, __ATOMIC_RELAXED) +#endif + +#ifndef pl_load +#define pl_load(ptr) __atomic_load_n(ptr, __ATOMIC_ACQUIRE) +#endif + +/* atomic store */ +#ifndef pl_store_lax +#define pl_store_lax(ptr, x) __atomic_store_n((ptr), (x), __ATOMIC_RELAXED) +#endif + +#ifndef pl_store +#define pl_store(ptr, x) __atomic_store_n((ptr), (x), __ATOMIC_RELEASE) +#endif + +/* increment integer value pointed to by pointer <ptr>, and return non-zero if + * result is non-null. + */ +#ifndef pl_inc_lax +#define pl_inc_lax(ptr) (__atomic_add_fetch((ptr), 1, __ATOMIC_RELAXED) != 0) +#endif + +#ifndef pl_inc_acq +#define pl_inc_acq(ptr) (__atomic_add_fetch((ptr), 1, __ATOMIC_ACQUIRE) != 0) +#endif + +#ifndef pl_inc_rel +#define pl_inc_rel(ptr) (__atomic_add_fetch((ptr), 1, __ATOMIC_RELEASE) != 0) +#endif + +#ifndef pl_inc +#define pl_inc(ptr) (__atomic_add_fetch((ptr), 1, __ATOMIC_SEQ_CST) != 0) +#endif + +/* decrement integer value pointed to by pointer <ptr>, and return non-zero if + * result is non-null. 
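+ *
+ * Typical illustrative pattern: reference counting, where the object is
+ * released by whichever thread drops the count to zero (obj_free() is a
+ * hypothetical destructor):
+ *
+ *     pl_inc(&obj->refcnt);            // take a reference
+ *     ...
+ *     if (!pl_dec(&obj->refcnt))       // returns 0 once the count hits zero
+ *         obj_free(obj);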
+/* decrement integer value pointed to by pointer <ptr>, and return non-zero if
+ * the result is non-zero.
+ */
+#ifndef pl_dec_lax
+#define pl_dec_lax(ptr) (__atomic_sub_fetch((ptr), 1, __ATOMIC_RELAXED) != 0)
+#endif
+
+#ifndef pl_dec_acq
+#define pl_dec_acq(ptr) (__atomic_sub_fetch((ptr), 1, __ATOMIC_ACQUIRE) != 0)
+#endif
+
+#ifndef pl_dec_rel
+#define pl_dec_rel(ptr) (__atomic_sub_fetch((ptr), 1, __ATOMIC_RELEASE) != 0)
+#endif
+
+#ifndef pl_dec
+#define pl_dec(ptr) (__atomic_sub_fetch((ptr), 1, __ATOMIC_SEQ_CST) != 0)
+#endif
+
+/* increment integer value pointed to by pointer <ptr>, no return */
+#ifndef pl_inc_noret_lax
+#define pl_inc_noret_lax(ptr) ((void)__atomic_add_fetch((ptr), 1, __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_inc_noret_acq
+#define pl_inc_noret_acq(ptr) ((void)__atomic_add_fetch((ptr), 1, __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_inc_noret_rel
+#define pl_inc_noret_rel(ptr) ((void)__atomic_add_fetch((ptr), 1, __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_inc_noret
+#define pl_inc_noret(ptr) ((void)__atomic_add_fetch((ptr), 1, __ATOMIC_SEQ_CST))
+#endif
+
+/* decrement integer value pointed to by pointer <ptr>, no return */
+#ifndef pl_dec_noret_lax
+#define pl_dec_noret_lax(ptr) ((void)__atomic_sub_fetch((ptr), 1, __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_dec_noret_acq
+#define pl_dec_noret_acq(ptr) ((void)__atomic_sub_fetch((ptr), 1, __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_dec_noret_rel
+#define pl_dec_noret_rel(ptr) ((void)__atomic_sub_fetch((ptr), 1, __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_dec_noret
+#define pl_dec_noret(ptr) ((void)__atomic_sub_fetch((ptr), 1, __ATOMIC_SEQ_CST))
+#endif
+
+/* add integer constant <x> to integer value pointed to by pointer <ptr>,
+ * no return. Size of <x> is not checked.
+ */
+#ifndef pl_add_lax
+#define pl_add_lax(ptr, x) (__atomic_add_fetch((ptr), (x), __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_add_acq
+#define pl_add_acq(ptr, x) (__atomic_add_fetch((ptr), (x), __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_add_rel
+#define pl_add_rel(ptr, x) (__atomic_add_fetch((ptr), (x), __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_add
+#define pl_add(ptr, x) (__atomic_add_fetch((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
+/* subtract integer constant <x> from integer value pointed to by pointer
+ * <ptr>, no return. Size of <x> is not checked.
+ */
+#ifndef pl_sub_lax
+#define pl_sub_lax(ptr, x) (__atomic_sub_fetch((ptr), (x), __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_sub_acq
+#define pl_sub_acq(ptr, x) (__atomic_sub_fetch((ptr), (x), __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_sub_rel
+#define pl_sub_rel(ptr, x) (__atomic_sub_fetch((ptr), (x), __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_sub
+#define pl_sub(ptr, x) (__atomic_sub_fetch((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
+/* binary and integer value pointed to by pointer <ptr> with constant <x>, no
+ * return. Size of <x> is not checked.
+ */
+#ifndef pl_and_lax
+#define pl_and_lax(ptr, x) (__atomic_and_fetch((ptr), (x), __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_and_acq
+#define pl_and_acq(ptr, x) (__atomic_and_fetch((ptr), (x), __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_and_rel
+#define pl_and_rel(ptr, x) (__atomic_and_fetch((ptr), (x), __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_and
+#define pl_and(ptr, x) (__atomic_and_fetch((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
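+/* EXAMPLE (editorial): atomically setting and clearing a flag bit using the
+ * bitwise primitives; the names are invented and the block stays disabled
+ * with "#if 0" since pl_or() is only defined just below.
+ */
+#if 0
+#define FLAG_RUNNING 0x00000001
+
+static unsigned int shared_flags;
+
+static inline void set_running(void)
+{
+	pl_or(&shared_flags, FLAG_RUNNING);   /* atomically set the bit */
+}
+
+static inline void clr_running(void)
+{
+	pl_and(&shared_flags, ~FLAG_RUNNING); /* atomically clear the bit */
+}
+#endif
+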
+/* binary or integer value pointed to by pointer <ptr> with constant <x>, no
+ * return. Size of <x> is not checked.
+ */
+#ifndef pl_or_lax
+#define pl_or_lax(ptr, x) (__atomic_or_fetch((ptr), (x), __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_or_acq
+#define pl_or_acq(ptr, x) (__atomic_or_fetch((ptr), (x), __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_or_rel
+#define pl_or_rel(ptr, x) (__atomic_or_fetch((ptr), (x), __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_or
+#define pl_or(ptr, x) (__atomic_or_fetch((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
+/* binary xor integer value pointed to by pointer <ptr> with constant <x>, no
+ * return. Size of <x> is not checked.
+ */
+#ifndef pl_xor_lax
+#define pl_xor_lax(ptr, x) (__atomic_xor_fetch((ptr), (x), __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_xor_acq
+#define pl_xor_acq(ptr, x) (__atomic_xor_fetch((ptr), (x), __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_xor_rel
+#define pl_xor_rel(ptr, x) (__atomic_xor_fetch((ptr), (x), __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_xor
+#define pl_xor(ptr, x) (__atomic_xor_fetch((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
+/* fetch-and-add: fetch integer value pointed to by pointer <ptr>, add <x>
+ * to <*ptr> and return the previous value.
+ * => THIS IS LEGACY, USE pl_ldadd() INSTEAD.
+ */
+#ifndef pl_xadd
+#define pl_xadd(ptr, x) (__atomic_fetch_add((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
+/* exchange value <x> with integer value pointed to by pointer <ptr>, and return
+ * previous <*ptr> value. <x> must be of the same size as <*ptr>.
+ */
+#ifndef pl_xchg
+#define pl_xchg(ptr, x) (__atomic_exchange_n((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
+/* compare integer value <*ptr> with <old> and exchange it with <new> if
+ * it matches, and return <old>. <old> and <new> must be of the same size as
+ * <*ptr>.
+ */
+#ifndef pl_cmpxchg
+#define pl_cmpxchg(ptr, old, new) ({ \
+ typeof(*ptr) __old = (old); \
+ __atomic_compare_exchange_n((ptr), &__old, (new), 0, __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); \
+ __old; })
+#endif
+
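+/* EXAMPLE (editorial): the usual compare-and-swap retry loop built on
+ * pl_cmpxchg() above, here maintaining a shared maximum; the function name
+ * is invented and the block is disabled with "#if 0".
+ */
+#if 0
+static inline void update_max(unsigned int *max, unsigned int v)
+{
+	unsigned int cur = pl_load(max);
+
+	/* retry until <v> is no larger than the published maximum, or until
+	 * our CAS wins the race to publish it.
+	 */
+	while (v > cur) {
+		unsigned int prev = pl_cmpxchg(max, cur, v);
+
+		if (prev == cur)
+			break;    /* swapped: <v> is now published */
+		cur = prev;       /* lost the race: retry against new value */
+	}
+}
+#endif
+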
+/* fetch-and-add: fetch integer value pointed to by pointer <ptr>, add <x>
+ * to <*ptr> and return the previous value.
+ */
+#ifndef pl_ldadd_lax
+#define pl_ldadd_lax(ptr, x) (__atomic_fetch_add((ptr), (x), __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_ldadd_acq
+#define pl_ldadd_acq(ptr, x) (__atomic_fetch_add((ptr), (x), __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_ldadd_rel
+#define pl_ldadd_rel(ptr, x) (__atomic_fetch_add((ptr), (x), __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_ldadd
+#define pl_ldadd(ptr, x) (__atomic_fetch_add((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
+
+#ifndef pl_ldand_lax
+#define pl_ldand_lax(ptr, x) (__atomic_fetch_and((ptr), (x), __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_ldand_acq
+#define pl_ldand_acq(ptr, x) (__atomic_fetch_and((ptr), (x), __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_ldand_rel
+#define pl_ldand_rel(ptr, x) (__atomic_fetch_and((ptr), (x), __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_ldand
+#define pl_ldand(ptr, x) (__atomic_fetch_and((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
+
+#ifndef pl_ldor_lax
+#define pl_ldor_lax(ptr, x) (__atomic_fetch_or((ptr), (x), __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_ldor_acq
+#define pl_ldor_acq(ptr, x) (__atomic_fetch_or((ptr), (x), __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_ldor_rel
+#define pl_ldor_rel(ptr, x) (__atomic_fetch_or((ptr), (x), __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_ldor
+#define pl_ldor(ptr, x) (__atomic_fetch_or((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
+
+#ifndef pl_ldsub_lax
+#define pl_ldsub_lax(ptr, x) (__atomic_fetch_sub((ptr), (x), __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_ldsub_acq
+#define pl_ldsub_acq(ptr, x) (__atomic_fetch_sub((ptr), (x), __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_ldsub_rel
+#define pl_ldsub_rel(ptr, x) (__atomic_fetch_sub((ptr), (x), __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_ldsub
+#define pl_ldsub(ptr, x) (__atomic_fetch_sub((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
+
+#ifndef pl_ldxor_lax
+#define pl_ldxor_lax(ptr, x) (__atomic_fetch_xor((ptr), (x), __ATOMIC_RELAXED))
+#endif
+
+#ifndef pl_ldxor_acq
+#define pl_ldxor_acq(ptr, x) (__atomic_fetch_xor((ptr), (x), __ATOMIC_ACQUIRE))
+#endif
+
+#ifndef pl_ldxor_rel
+#define pl_ldxor_rel(ptr, x) (__atomic_fetch_xor((ptr), (x), __ATOMIC_RELEASE))
+#endif
+
+#ifndef pl_ldxor
+#define pl_ldxor(ptr, x) (__atomic_fetch_xor((ptr), (x), __ATOMIC_SEQ_CST))
+#endif
+
+#endif /* end of C11 atomics */
+
+
+/*
+ * Automatically remap to fallback code when available. This allows the arch
+ * specific code above to be used as an immediate fallback for missing C11
+ * definitions. Everything not defined will use the generic code at the end.
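+ * An editorial illustration of the fetch-and-* primitives above is
+ * interleaved right below.
+ */
+
+/* EXAMPLE (editorial): pl_ldadd() returns the value prior to the addition,
+ * which is exactly what a ticket dispenser needs; the names are invented
+ * and the block is disabled with "#if 0".
+ */
+#if 0
+static unsigned int next_ticket;
+
+static inline unsigned int take_ticket(void)
+{
+	/* every caller obtains a distinct, monotonically increasing number */
+	return pl_ldadd(&next_ticket, 1);
+}
+#endif
+
+/* (the remapping directives follow)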
+ */ + +#if !defined(pl_cpu_relax) && defined(_pl_cpu_relax) +# define pl_cpu_relax _pl_cpu_relax +#endif + +#if !defined(pl_barrier) && defined(_pl_barrier) +# define pl_barrier _pl_barrier +#endif + +#if !defined(pl_mb) && defined(_pl_mb) +# define pl_mb _pl_mb +#endif + +#if !defined(pl_mb_load) && defined(_pl_mb_load) +# define pl_mb_load _pl_mb_load +#endif + +#if !defined(pl_mb_store) && defined(_pl_mb_store) +# define pl_mb_store _pl_mb_store +#endif + +#if !defined(pl_mb_ato) && defined(_pl_mb_ato) +# define pl_mb_ato _pl_mb_ato +#endif + +#if !defined(pl_mb_ato_load) && defined(_pl_mb_ato_load) +# define pl_mb_ato_load _pl_mb_ato_load +#endif + +#if !defined(pl_mb_ato_store) && defined(_pl_mb_ato_store) +# define pl_mb_ato_store _pl_mb_ato_store +#endif + + +#if !defined(pl_load) && defined(_pl_load) +#define pl_load _pl_load +#endif + +#if !defined(pl_load_lax) && defined(_pl_load_lax) +#define pl_load_lax _pl_load_lax +#endif + +#if !defined(pl_store) && defined(_pl_store) +#define pl_store _pl_store +#endif + +#if !defined(pl_store_lax) && defined(_pl_store_lax) +#define pl_store_lax _pl_store_lax +#endif + + +#if !defined(pl_inc_noret_lax) && defined(_pl_inc_noret_lax) +# define pl_inc_noret_lax _pl_inc_noret_lax +#endif + +#if !defined(pl_inc_noret_acq) && defined(_pl_inc_noret_acq) +# define pl_inc_noret_acq _pl_inc_noret_acq +#endif + +#if !defined(pl_inc_noret_rel) && defined(_pl_inc_noret_rel) +# define pl_inc_noret_rel _pl_inc_noret_rel +#endif + +#if !defined(pl_inc_noret) && defined(_pl_inc_noret) +# define pl_inc_noret _pl_inc_noret +#endif + + +#if !defined(pl_dec_noret_lax) && defined(_pl_dec_noret_lax) +# define pl_dec_noret_lax _pl_dec_noret_lax +#endif + +#if !defined(pl_dec_noret_acq) && defined(_pl_dec_noret_acq) +# define pl_dec_noret_acq _pl_dec_noret_acq +#endif + +#if !defined(pl_dec_noret_rel) && defined(_pl_dec_noret_rel) +# define pl_dec_noret_rel _pl_dec_noret_rel +#endif + +#if !defined(pl_dec_noret) && defined(_pl_dec_noret) +# define pl_dec_noret _pl_dec_noret +#endif + + +#if !defined(pl_inc_lax) && defined(_pl_inc_lax) +# define pl_inc_lax _pl_inc_lax +#endif + +#if !defined(pl_inc_acq) && defined(_pl_inc_acq) +# define pl_inc_acq _pl_inc_acq +#endif + +#if !defined(pl_inc_rel) && defined(_pl_inc_rel) +# define pl_inc_rel _pl_inc_rel +#endif + +#if !defined(pl_inc) && defined(_pl_inc) +# define pl_inc _pl_inc +#endif + + +#if !defined(pl_dec_lax) && defined(_pl_dec_lax) +# define pl_dec_lax _pl_dec_lax +#endif + +#if !defined(pl_dec_acq) && defined(_pl_dec_acq) +# define pl_dec_acq _pl_dec_acq +#endif + +#if !defined(pl_dec_rel) && defined(_pl_dec_rel) +# define pl_dec_rel _pl_dec_rel +#endif + +#if !defined(pl_dec) && defined(_pl_dec) +# define pl_dec _pl_dec +#endif + + +#if !defined(pl_add_lax) && defined(_pl_add_lax) +# define pl_add_lax _pl_add_lax +#endif + +#if !defined(pl_add_acq) && defined(_pl_add_acq) +# define pl_add_acq _pl_add_acq +#endif + +#if !defined(pl_add_rel) && defined(_pl_add_rel) +# define pl_add_rel _pl_add_rel +#endif + +#if !defined(pl_add) && defined(_pl_add) +# define pl_add _pl_add +#endif + + +#if !defined(pl_add_noret_lax) && defined(_pl_add_noret_lax) +# define pl_add_noret_lax _pl_add_noret_lax +#endif + +#if !defined(pl_add_noret_acq) && defined(_pl_add_noret_acq) +# define pl_add_noret_acq _pl_add_noret_acq +#endif + +#if !defined(pl_add_noret_rel) && defined(_pl_add_noret_rel) +# define pl_add_noret_rel _pl_add_noret_rel +#endif + +#if !defined(pl_add_noret) && defined(_pl_add_noret) +# define pl_add_noret 
_pl_add_noret +#endif + +#if !defined(pl_and_lax) && defined(_pl_and_lax) +# define pl_and_lax _pl_and_lax +#endif + +#if !defined(pl_and_acq) && defined(_pl_and_acq) +# define pl_and_acq _pl_and_acq +#endif + +#if !defined(pl_and_rel) && defined(_pl_and_rel) +# define pl_and_rel _pl_and_rel +#endif + +#if !defined(pl_and) && defined(_pl_and) +# define pl_and _pl_and +#endif + + +#if !defined(pl_and_noret_lax) && defined(_pl_and_noret_lax) +# define pl_and_noret_lax _pl_and_noret_lax +#endif + +#if !defined(pl_and_noret_acq) && defined(_pl_and_noret_acq) +# define pl_and_noret_acq _pl_and_noret_acq +#endif + +#if !defined(pl_and_noret_rel) && defined(_pl_and_noret_rel) +# define pl_and_noret_rel _pl_and_noret_rel +#endif + +#if !defined(pl_and_noret) && defined(_pl_and_noret) +# define pl_and_noret _pl_and_noret +#endif + + +#if !defined(pl_or_lax) && defined(_pl_or_lax) +# define pl_or_lax _pl_or_lax +#endif + +#if !defined(pl_or_acq) && defined(_pl_or_acq) +# define pl_or_acq _pl_or_acq +#endif + +#if !defined(pl_or_rel) && defined(_pl_or_rel) +# define pl_or_rel _pl_or_rel +#endif + +#if !defined(pl_or) && defined(_pl_or) +# define pl_or _pl_or +#endif + + +#if !defined(pl_or_noret_lax) && defined(_pl_or_noret_lax) +# define pl_or_noret_lax _pl_or_noret_lax +#endif + +#if !defined(pl_or_noret_acq) && defined(_pl_or_noret_acq) +# define pl_or_noret_acq _pl_or_noret_acq +#endif + +#if !defined(pl_or_noret_rel) && defined(_pl_or_noret_rel) +# define pl_or_noret_rel _pl_or_noret_rel +#endif + +#if !defined(pl_or_noret) && defined(_pl_or_noret) +# define pl_or_noret _pl_or_noret +#endif + + +#if !defined(pl_xor_lax) && defined(_pl_xor_lax) +# define pl_xor_lax _pl_xor_lax +#endif + +#if !defined(pl_xor_acq) && defined(_pl_xor_acq) +# define pl_xor_acq _pl_xor_acq +#endif + +#if !defined(pl_xor_rel) && defined(_pl_xor_rel) +# define pl_xor_rel _pl_xor_rel +#endif + +#if !defined(pl_xor) && defined(_pl_xor) +# define pl_xor _pl_xor +#endif + + +#if !defined(pl_xor_noret_lax) && defined(_pl_xor_noret_lax) +# define pl_xor_noret_lax _pl_xor_noret_lax +#endif + +#if !defined(pl_xor_noret_acq) && defined(_pl_xor_noret_acq) +# define pl_xor_noret_acq _pl_xor_noret_acq +#endif + +#if !defined(pl_xor_noret_rel) && defined(_pl_xor_noret_rel) +# define pl_xor_noret_rel _pl_xor_noret_rel +#endif + +#if !defined(pl_xor_noret) && defined(_pl_xor_noret) +# define pl_xor_noret _pl_xor_noret +#endif + + +#if !defined(pl_sub_lax) && defined(_pl_sub_lax) +# define pl_sub_lax _pl_sub_lax +#endif + +#if !defined(pl_sub_acq) && defined(_pl_sub_acq) +# define pl_sub_acq _pl_sub_acq +#endif + +#if !defined(pl_sub_rel) && defined(_pl_sub_rel) +# define pl_sub_rel _pl_sub_rel +#endif + +#if !defined(pl_sub) && defined(_pl_sub) +# define pl_sub _pl_sub +#endif + + +#if !defined(pl_sub_noret_lax) && defined(_pl_sub_noret_lax) +# define pl_sub_noret_lax _pl_sub_noret_lax +#endif + +#if !defined(pl_sub_noret_acq) && defined(_pl_sub_noret_acq) +# define pl_sub_noret_acq _pl_sub_noret_acq +#endif + +#if !defined(pl_sub_noret_rel) && defined(_pl_sub_noret_rel) +# define pl_sub_noret_rel _pl_sub_noret_rel +#endif + +#if !defined(pl_sub_noret) && defined(_pl_sub_noret) +# define pl_sub_noret _pl_sub_noret +#endif + + +#if !defined(pl_btr_lax) && defined(_pl_btr_lax) +# define pl_btr_lax _pl_btr_lax +#endif + +#if !defined(pl_btr_acq) && defined(_pl_btr_acq) +# define pl_btr_acq _pl_btr_acq +#endif + +#if !defined(pl_btr_rel) && defined(_pl_btr_rel) +# define pl_btr_rel _pl_btr_rel +#endif + +#if !defined(pl_btr) && 
defined(_pl_btr) +# define pl_btr _pl_btr +#endif + + +#if !defined(pl_bts_lax) && defined(_pl_bts_lax) +# define pl_bts_lax _pl_bts_lax +#endif + +#if !defined(pl_bts_acq) && defined(_pl_bts_acq) +# define pl_bts_acq _pl_bts_acq +#endif + +#if !defined(pl_bts_rel) && defined(_pl_bts_rel) +# define pl_bts_rel _pl_bts_rel +#endif + +#if !defined(pl_bts) && defined(_pl_bts) +# define pl_bts _pl_bts +#endif + + +#if !defined(pl_xadd) && defined(_pl_xadd) +# define pl_xadd _pl_xadd +#endif + +#if !defined(pl_cmpxchg) && defined(_pl_cmpxchg) +# define pl_cmpxchg _pl_cmpxchg +#endif + +#if !defined(pl_xchg) && defined(_pl_xchg) +# define pl_xchg _pl_xchg +#endif + + +#if !defined(pl_ldadd_lax) && defined(_pl_ldadd_lax) +# define pl_ldadd_lax _pl_ldadd_lax +#endif + +#if !defined(pl_ldadd_acq) && defined(_pl_ldadd_acq) +# define pl_ldadd_acq _pl_ldadd_acq +#endif + +#if !defined(pl_ldadd_rel) && defined(_pl_ldadd_rel) +# define pl_ldadd_rel _pl_ldadd_rel +#endif + +#if !defined(pl_ldadd) && defined(_pl_ldadd) +# define pl_ldadd _pl_ldadd +#endif + + +#if !defined(pl_ldand_lax) && defined(_pl_ldand_lax) +# define pl_ldand_lax _pl_ldand_lax +#endif + +#if !defined(pl_ldand_acq) && defined(_pl_ldand_acq) +# define pl_ldand_acq _pl_ldand_acq +#endif + +#if !defined(pl_ldand_rel) && defined(_pl_ldand_rel) +# define pl_ldand_rel _pl_ldand_rel +#endif + +#if !defined(pl_ldand) && defined(_pl_ldand) +# define pl_ldand _pl_ldand +#endif + + +#if !defined(pl_ldor_lax) && defined(_pl_ldor_lax) +# define pl_ldor_lax _pl_ldor_lax +#endif + +#if !defined(pl_ldor_acq) && defined(_pl_ldor_acq) +# define pl_ldor_acq _pl_ldor_acq +#endif + +#if !defined(pl_ldor_rel) && defined(_pl_ldor_rel) +# define pl_ldor_rel _pl_ldor_rel +#endif + +#if !defined(pl_ldor) && defined(_pl_ldor) +# define pl_ldor _pl_ldor +#endif + + +#if !defined(pl_ldxor_lax) && defined(_pl_ldxor_lax) +# define pl_ldxor_lax _pl_ldxor_lax +#endif + +#if !defined(pl_ldxor_acq) && defined(_pl_ldxor_acq) +# define pl_ldxor_acq _pl_ldxor_acq +#endif + +#if !defined(pl_ldxor_rel) && defined(_pl_ldxor_rel) +# define pl_ldxor_rel _pl_ldxor_rel +#endif + +#if !defined(pl_ldxor) && defined(_pl_ldxor) +# define pl_ldxor _pl_ldxor +#endif + + +#if !defined(pl_ldsub_lax) && defined(_pl_ldsub_lax) +# define pl_ldsub_lax _pl_ldsub_lax +#endif + +#if !defined(pl_ldsub_acq) && defined(_pl_ldsub_acq) +# define pl_ldsub_acq _pl_ldsub_acq +#endif + +#if !defined(pl_ldsub_rel) && defined(_pl_ldsub_rel) +# define pl_ldsub_rel _pl_ldsub_rel +#endif + +#if !defined(pl_ldsub) && defined(_pl_ldsub) +# define pl_ldsub _pl_ldsub +#endif + + +/* + * Generic code using the __sync API for everything not defined above. 
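+ * An editorial spinlock sketch is interleaved right below.
+ */
+
+/* EXAMPLE (editorial, not part of the original file): a minimal test-and-set
+ * spinlock built from the portable primitives; all names are invented and
+ * the block is disabled with "#if 0".
+ */
+#if 0
+static unsigned int lock_bit;
+
+static inline void example_lock(void)
+{
+	/* pl_xchg() returns the previous value: 0 means we just acquired it */
+	while (pl_xchg(&lock_bit, 1))
+		pl_cpu_relax();
+}
+
+static inline void example_unlock(void)
+{
+	pl_store(&lock_bit, 0); /* pl_store() is ordered like a release */
+}
+#endif
+
+/* (the __sync-based fallbacks follow)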
+ */ + + +/* CPU relaxation while waiting */ +#ifndef pl_cpu_relax +#define pl_cpu_relax() do { \ + asm volatile(""); \ + } while (0) +#endif + +/* compiler-only memory barrier, for use around locks */ +#ifndef pl_barrier +#define pl_barrier() do { \ + asm volatile("" ::: "memory"); \ + } while (0) +#endif + +/* full memory barrier */ +#ifndef pl_mb +#define pl_mb() do { \ + __sync_synchronize(); \ + } while (0) +#endif + +#ifndef pl_mb_load +#define pl_mb_load() pl_mb() +#endif + +#ifndef pl_mb_store +#define pl_mb_store() pl_mb() +#endif + +#ifndef pl_mb_ato +#define pl_mb_ato() pl_mb() +#endif + +#ifndef pl_mb_ato_load +#define pl_mb_ato_load() pl_mb_ato() +#endif + +#ifndef pl_mb_ato_store +#define pl_mb_ato_store() pl_mb_ato() +#endif + +/* atomic load: volatile after a load barrier */ +#ifndef pl_load +#define pl_load(ptr) ({ \ + typeof(*(ptr)) __pl_ret = ({ \ + pl_mb_load(); \ + *(volatile typeof(ptr))ptr; \ + }); \ + __pl_ret; \ + }) +#endif + +/* atomic store, old style using a CAS */ +#ifndef pl_store +#define pl_store(ptr, x) do { \ + typeof((ptr)) __pl_ptr = (ptr); \ + typeof((x)) __pl_x = (x); \ + typeof(*(ptr)) __pl_old; \ + do { \ + __pl_old = *__pl_ptr; \ + } while (!__sync_bool_compare_and_swap(__pl_ptr, __pl_old, __pl_x)); \ + } while (0) +#endif + +#ifndef pl_inc_noret +#define pl_inc_noret(ptr) do { __sync_add_and_fetch((ptr), 1); } while (0) +#endif + +#ifndef pl_dec_noret +#define pl_dec_noret(ptr) do { __sync_sub_and_fetch((ptr), 1); } while (0) +#endif + +#ifndef pl_inc +#define pl_inc(ptr) ({ __sync_add_and_fetch((ptr), 1); }) +#endif + +#ifndef pl_dec +#define pl_dec(ptr) ({ __sync_sub_and_fetch((ptr), 1); }) +#endif + +#ifndef pl_add +#define pl_add(ptr, x) ({ __sync_add_and_fetch((ptr), (x)); }) +#endif + +#ifndef pl_and +#define pl_and(ptr, x) ({ __sync_and_and_fetch((ptr), (x)); }) +#endif + +#ifndef pl_or +#define pl_or(ptr, x) ({ __sync_or_and_fetch((ptr), (x)); }) +#endif + +#ifndef pl_xor +#define pl_xor(ptr, x) ({ __sync_xor_and_fetch((ptr), (x)); }) +#endif + +#ifndef pl_sub +#define pl_sub(ptr, x) ({ __sync_sub_and_fetch((ptr), (x)); }) +#endif + +#ifndef pl_btr +#define pl_btr(ptr, bit) ({ typeof(*(ptr)) __pl_t = ((typeof(*(ptr)))1) << (bit); \ + __sync_fetch_and_and((ptr), ~__pl_t) & __pl_t; \ + }) +#endif + +#ifndef pl_bts +#define pl_bts(ptr, bit) ({ typeof(*(ptr)) __pl_t = ((typeof(*(ptr)))1) << (bit); \ + __sync_fetch_and_or((ptr), __pl_t) & __pl_t; \ + }) +#endif + +#ifndef pl_xadd +#define pl_xadd(ptr, x) ({ __sync_fetch_and_add((ptr), (x)); }) +#endif + +#ifndef pl_cmpxchg +#define pl_cmpxchg(ptr, o, n) ({ __sync_val_compare_and_swap((ptr), (o), (n)); }) +#endif + +#ifndef pl_xchg +#define pl_xchg(ptr, x) ({ \ + typeof((ptr)) __pl_ptr = (ptr); \ + typeof((x)) __pl_x = (x); \ + typeof(*(ptr)) __pl_old; \ + do { \ + __pl_old = *__pl_ptr; \ + } while (!__sync_bool_compare_and_swap(__pl_ptr, __pl_old, __pl_x)); \ + __pl_old; \ + }) +#endif + +#ifndef pl_ldadd +#define pl_ldadd(ptr, x) ({ __sync_fetch_and_add((ptr), (x)); }) +#endif + +#ifndef pl_ldand +#define pl_ldand(ptr, x) ({ __sync_fetch_and_and((ptr), (x)); }) +#endif + +#ifndef pl_ldor +#define pl_ldor(ptr, x) ({ __sync_fetch_and_or((ptr), (x)); }) +#endif + +#ifndef pl_ldxor +#define pl_ldxor(ptr, x) ({ __sync_fetch_and_xor((ptr), (x)); }) +#endif + +#ifndef pl_ldsub +#define pl_ldsub(ptr, x) ({ __sync_fetch_and_sub((ptr), (x)); }) +#endif + +/* certain _noret operations may be defined from the regular ones */ +#if !defined(pl_inc_noret) && defined(pl_inc) +# define pl_inc_noret(ptr) 
(void)pl_inc(ptr) +#endif + +#if !defined(pl_dec_noret) && defined(pl_dec) +# define pl_dec_noret(ptr) (void)pl_dec(ptr) +#endif + +#if !defined(pl_add_noret) && defined(pl_add) +# define pl_add_noret(ptr, x) (void)pl_add(ptr, x) +#endif + +#if !defined(pl_sub_noret) && defined(pl_sub) +# define pl_sub_noret(ptr, x) (void)pl_sub(ptr, x) +#endif + +#if !defined(pl_or_noret) && defined(pl_or) +# define pl_or_noret(ptr, x) (void)pl_or(ptr, x) +#endif + +#if !defined(pl_and_noret) && defined(pl_and) +# define pl_and_noret(ptr, x) (void)pl_and(ptr, x) +#endif + +#if !defined(pl_xor_noret) && defined(pl_xor) +# define pl_xor_noret(ptr, x) (void)pl_xor(ptr, x) +#endif + +/* certain memory orders may fallback to the generic seq_cst definition */ + +#if !defined(pl_load_lax) && defined(pl_load) +#define pl_load_lax pl_load +#endif + + +#if !defined(pl_store_lax) && defined(pl_store) +#define pl_store_lax pl_store +#endif + + +#if !defined(pl_inc_lax) && defined(pl_inc) +# define pl_inc_lax pl_inc +#endif +#if !defined(pl_inc_acq) && defined(pl_inc) +# define pl_inc_acq pl_inc +#endif +#if !defined(pl_inc_rel) && defined(pl_inc) +# define pl_inc_rel pl_inc +#endif + + +#if !defined(pl_dec_lax) && defined(pl_dec) +# define pl_dec_lax pl_dec +#endif +#if !defined(pl_dec_acq) && defined(pl_dec) +# define pl_dec_acq pl_dec +#endif + +#if !defined(pl_dec_rel) && defined(pl_dec) +# define pl_dec_rel pl_dec +#endif + + +#if !defined(pl_inc_noret_lax) && defined(pl_inc_noret) +# define pl_inc_noret_lax pl_inc_noret +#endif + +#if !defined(pl_inc_noret_acq) && defined(pl_inc_noret) +# define pl_inc_noret_acq pl_inc_noret +#endif + +#if !defined(pl_inc_noret_rel) && defined(pl_inc_noret) +# define pl_inc_noret_rel pl_inc_noret +#endif + + +#if !defined(pl_dec_noret_lax) && defined(pl_dec_noret) +# define pl_dec_noret_lax pl_dec_noret +#endif + +#if !defined(pl_dec_noret_acq) && defined(pl_dec_noret) +# define pl_dec_noret_acq pl_dec_noret +#endif + +#if !defined(pl_dec_noret_rel) && defined(pl_dec_noret) +# define pl_dec_noret_rel pl_dec_noret +#endif + + +#if !defined(pl_add_lax) && defined(pl_add) +# define pl_add_lax pl_add +#endif + +#if !defined(pl_add_acq) && defined(pl_add) +# define pl_add_acq pl_add +#endif + +#if !defined(pl_add_rel) && defined(pl_add) +# define pl_add_rel pl_add +#endif + + +#if !defined(pl_sub_lax) && defined(pl_sub) +# define pl_sub_lax pl_sub +#endif + +#if !defined(pl_sub_acq) && defined(pl_sub) +# define pl_sub_acq pl_sub +#endif + +#if !defined(pl_sub_rel) && defined(pl_sub) +# define pl_sub_rel pl_sub +#endif + + +#if !defined(pl_and_lax) && defined(pl_and) +# define pl_and_lax pl_and +#endif + +#if !defined(pl_and_acq) && defined(pl_and) +# define pl_and_acq pl_and +#endif + +#if !defined(pl_and_rel) && defined(pl_and) +# define pl_and_rel pl_and +#endif + + +#if !defined(pl_or_lax) && defined(pl_or) +# define pl_or_lax pl_or +#endif + +#if !defined(pl_or_acq) && defined(pl_or) +# define pl_or_acq pl_or +#endif + +#if !defined(pl_or_rel) && defined(pl_or) +# define pl_or_rel pl_or +#endif + + +#if !defined(pl_xor_lax) && defined(pl_xor) +# define pl_xor_lax pl_xor +#endif + +#if !defined(pl_xor_acq) && defined(pl_xor) +# define pl_xor_acq pl_xor +#endif + +#if !defined(pl_xor_rel) && defined(pl_xor) +# define pl_xor_rel pl_xor +#endif + + +#if !defined(pl_add_noret_lax) && defined(pl_add_noret) +# define pl_add_noret_lax pl_add_noret +#endif + +#if !defined(pl_add_noret_acq) && defined(pl_add_noret) +# define pl_add_noret_acq pl_add_noret +#endif + +#if 
!defined(pl_add_noret_rel) && defined(pl_add_noret) +# define pl_add_noret_rel pl_add_noret +#endif + + +#if !defined(pl_sub_noret_lax) && defined(pl_sub_noret) +# define pl_sub_noret_lax pl_sub_noret +#endif + +#if !defined(pl_sub_noret_acq) && defined(pl_sub_noret) +# define pl_sub_noret_acq pl_sub_noret +#endif + +#if !defined(pl_sub_noret_rel) && defined(pl_sub_noret) +# define pl_sub_noret_rel pl_sub_noret +#endif + + +#if !defined(pl_and_noret_lax) && defined(pl_and_noret) +# define pl_and_noret_lax pl_and_noret +#endif + +#if !defined(pl_and_noret_acq) && defined(pl_and_noret) +# define pl_and_noret_acq pl_and_noret +#endif + +#if !defined(pl_and_noret_rel) && defined(pl_and_noret) +# define pl_and_noret_rel pl_and_noret +#endif + + +#if !defined(pl_or_noret_lax) && defined(pl_or_noret) +# define pl_or_noret_lax pl_or_noret +#endif + +#if !defined(pl_or_noret_acq) && defined(pl_or_noret) +# define pl_or_noret_acq pl_or_noret +#endif + +#if !defined(pl_or_noret_rel) && defined(pl_or_noret) +# define pl_or_noret_rel pl_or_noret +#endif + + +#if !defined(pl_xor_noret_lax) && defined(pl_xor_noret) +# define pl_xor_noret_lax pl_xor_noret +#endif + +#if !defined(pl_xor_noret_acq) && defined(pl_xor_noret) +# define pl_xor_noret_acq pl_xor_noret +#endif + +#if !defined(pl_xor_noret_rel) && defined(pl_xor_noret) +# define pl_xor_noret_rel pl_xor_noret +#endif + + +#if !defined(pl_btr_lax) && defined(pl_btr) +# define pl_btr_lax pl_btr +#endif + +#if !defined(pl_btr_acq) && defined(pl_btr) +# define pl_btr_acq pl_btr +#endif + +#if !defined(pl_btr_rel) && defined(pl_btr) +# define pl_btr_rel pl_btr +#endif + + +#if !defined(pl_bts_lax) && defined(pl_bts) +# define pl_bts_lax pl_bts +#endif + +#if !defined(pl_bts_acq) && defined(pl_bts) +# define pl_bts_acq pl_bts +#endif + +#if !defined(pl_bts_rel) && defined(pl_bts) +# define pl_bts_rel pl_bts +#endif + + +#if !defined(pl_ldadd_lax) && defined(pl_ldadd) +# define pl_ldadd_lax pl_ldadd +#endif + +#if !defined(pl_ldadd_acq) && defined(pl_ldadd) +# define pl_ldadd_acq pl_ldadd +#endif + +#if !defined(pl_ldadd_rel) && defined(pl_ldadd) +# define pl_ldadd_rel pl_ldadd +#endif + + +#if !defined(pl_ldsub_lax) && defined(pl_ldsub) +# define pl_ldsub_lax pl_ldsub +#endif + +#if !defined(pl_ldsub_acq) && defined(pl_ldsub) +# define pl_ldsub_acq pl_ldsub +#endif + +#if !defined(pl_ldsub_rel) && defined(pl_ldsub) +# define pl_ldsub_rel pl_ldsub +#endif + + +#if !defined(pl_ldand_lax) && defined(pl_ldand) +# define pl_ldand_lax pl_ldand +#endif + +#if !defined(pl_ldand_acq) && defined(pl_ldand) +# define pl_ldand_acq pl_ldand +#endif + +#if !defined(pl_ldand_rel) && defined(pl_ldand) +# define pl_ldand_rel pl_ldand +#endif + + +#if !defined(pl_ldor_lax) && defined(pl_ldor) +# define pl_ldor_lax pl_ldor +#endif + +#if !defined(pl_ldor_acq) && defined(pl_ldor) +# define pl_ldor_acq pl_ldor +#endif + +#if !defined(pl_ldor_rel) && defined(pl_ldor) +# define pl_ldor_rel pl_ldor +#endif + + +#if !defined(pl_ldxor_lax) && defined(pl_ldxor) +# define pl_ldxor_lax pl_ldxor +#endif + +#if !defined(pl_ldxor_acq) && defined(pl_ldxor) +# define pl_ldxor_acq pl_ldxor +#endif + +#if !defined(pl_ldxor_rel) && defined(pl_ldxor) +# define pl_ldxor_rel pl_ldxor +#endif + + +#endif /* PL_ATOMIC_OPS_H */ diff --git a/include/import/eb32sctree.h b/include/import/eb32sctree.h new file mode 100644 index 0000000..5ace662 --- /dev/null +++ b/include/import/eb32sctree.h @@ -0,0 +1,121 @@ +/* + * Elastic Binary Trees - macros and structures for operations on 32bit nodes. 
+ * Version 6.0.6 with backports from v7-dev
+ * (C) 2002-2017 - Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _EB32SCTREE_H
+#define _EB32SCTREE_H
+
+#include "ebtree.h"
+
+
+/* Return the structure of type <type> whose member <member> points to <ptr> */
+#define eb32sc_entry(ptr, type, member) container_of(ptr, type, member)
+
+/*
+ * Exported functions and macros.
+ * Many of them are always inlined because they are extremely small, and
+ * are generally called at most once or twice in a program.
+ */
+
+/*
+ * The following functions are not inlined by default. They are declared
+ * in eb32sctree.c, which simply relies on their inline version.
+ */
+struct eb32sc_node *eb32sc_lookup_ge(struct eb_root *root, u32 x, unsigned long scope);
+struct eb32sc_node *eb32sc_lookup_ge_or_first(struct eb_root *root, u32 x, unsigned long scope);
+struct eb32sc_node *eb32sc_insert(struct eb_root *root, struct eb32sc_node *new, unsigned long scope);
+void eb32sc_delete(struct eb32sc_node *node);
+
+/* Walks down left starting at root pointer <start>, and follows the leftmost
+ * branch whose scope matches <scope>. It either returns the node hosting the
+ * first leaf on that side, or NULL if no leaf is found. <start> may either be
+ * NULL or a branch pointer.
+ */
+static inline struct eb32sc_node *eb32sc_walk_down_left(eb_troot_t *start, unsigned long scope)
+{
+ struct eb_root *root;
+ struct eb_node *node;
+ struct eb32sc_node *eb32;
+
+ if (unlikely(!start))
+ return NULL;
+
+ while (1) {
+ if (eb_gettag(start) == EB_NODE) {
+ root = eb_untag(start, EB_NODE);
+ node = eb_root_to_node(root);
+ eb32 = container_of(node, struct eb32sc_node, node);
+ if (eb32->node_s & scope) {
+ start = node->branches.b[EB_LEFT];
+ continue;
+ }
+ start = node->node_p;
+ }
+ else {
+ root = eb_untag(start, EB_LEAF);
+ node = eb_root_to_node(root);
+ eb32 = container_of(node, struct eb32sc_node, node);
+ if (eb32->leaf_s & scope)
+ return eb32;
+ start = node->leaf_p;
+ }
+
+ /* here we're on a node that doesn't match the scope. We have
+ * to walk to the closest right location.
+ */
+ while (eb_gettag(start) != EB_LEFT)
+ /* Walking up from right branch, so we cannot be below root */
+ start = (eb_root_to_node(eb_untag(start, EB_RGHT)))->node_p;
+
+ /* Note that <start> cannot be NULL at this stage */
+ root = eb_untag(start, EB_LEFT);
+ start = root->b[EB_RGHT];
+ if (eb_clrtag(start) == NULL)
+ return NULL;
+ }
+}
+
+/* Return next node in the tree, starting with tagged parent <start>, or NULL if none */
+static inline struct eb32sc_node *eb32sc_next_with_parent(eb_troot_t *start, unsigned long scope)
+{
+ while (eb_gettag(start) != EB_LEFT)
+ /* Walking up from right branch, so we cannot be below root */
+ start = (eb_root_to_node(eb_untag(start, EB_RGHT)))->node_p;
+
+ /* Note that <start> cannot be NULL at this stage */
+ start = (eb_untag(start, EB_LEFT))->b[EB_RGHT];
+ if (eb_clrtag(start) == NULL)
+ return NULL;
+
+ return eb32sc_walk_down_left(start, scope);
+}
+
+/* Return next node in the tree, or NULL if none */
+static inline struct eb32sc_node *eb32sc_next(struct eb32sc_node *eb32, unsigned long scope)
+{
+ return eb32sc_next_with_parent(eb32->node.leaf_p, scope);
+}
+
+/* Return leftmost node in the tree, or NULL if none */
+static inline struct eb32sc_node *eb32sc_first(struct eb_root *root, unsigned long scope)
+{
+ return eb32sc_walk_down_left(root->b[0], scope);
+}
+
+#endif /* _EB32SCTREE_H */
diff --git a/include/import/eb32tree.h b/include/import/eb32tree.h
new file mode 100644
index 0000000..1c03fc1
--- /dev/null
+++ b/include/import/eb32tree.h
@@ -0,0 +1,482 @@
+/*
+ * Elastic Binary Trees - macros and structures for operations on 32bit nodes.
+ * Version 6.0.6
+ * (C) 2002-2011 - Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _EB32TREE_H
+#define _EB32TREE_H
+
+#include "ebtree.h"
+
+
+/* Return the structure of type <type> whose member <member> points to <ptr> */
+#define eb32_entry(ptr, type, member) container_of(ptr, type, member)
+
+/*
+ * Exported functions and macros.
+ * Many of them are always inlined because they are extremely small, and
+ * are generally called at most once or twice in a program.
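+ * An editorial usage sketch is interleaved right below.
+ */
+
+/* EXAMPLE (editorial, not part of the original header): embedding an
+ * eb32_node into a user structure, inserting entries and walking them in
+ * key order. EB_ROOT comes from ebtree.h; the remaining names are invented
+ * and the block is disabled with "#if 0".
+ */
+#if 0
+struct entry {
+	struct eb32_node node; /* the index; node.key identifies the entry */
+	char data[64];
+};
+
+static void demo(void)
+{
+	struct eb_root root = EB_ROOT; /* this root accepts duplicate keys */
+	struct entry e = { };
+	struct eb32_node *n;
+
+	e.node.key = 42;
+	eb32_insert(&root, &e.node);
+
+	for (n = eb32_first(&root); n; n = eb32_next(n)) {
+		struct entry *it = eb32_entry(n, struct entry, node);
+		/* use it->data here */
+	}
+}
+#endif
+
+/* (the inline accessors follow)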
+ */ + +/* Return leftmost node in the tree, or NULL if none */ +static inline struct eb32_node *eb32_first(struct eb_root *root) +{ + return eb32_entry(eb_first(root), struct eb32_node, node); +} + +/* Return rightmost node in the tree, or NULL if none */ +static inline struct eb32_node *eb32_last(struct eb_root *root) +{ + return eb32_entry(eb_last(root), struct eb32_node, node); +} + +/* Return next node in the tree, or NULL if none */ +static inline struct eb32_node *eb32_next(struct eb32_node *eb32) +{ + return eb32_entry(eb_next(&eb32->node), struct eb32_node, node); +} + +/* Return previous node in the tree, or NULL if none */ +static inline struct eb32_node *eb32_prev(struct eb32_node *eb32) +{ + return eb32_entry(eb_prev(&eb32->node), struct eb32_node, node); +} + +/* Return next leaf node within a duplicate sub-tree, or NULL if none. */ +static inline struct eb32_node *eb32_next_dup(struct eb32_node *eb32) +{ + return eb32_entry(eb_next_dup(&eb32->node), struct eb32_node, node); +} + +/* Return previous leaf node within a duplicate sub-tree, or NULL if none. */ +static inline struct eb32_node *eb32_prev_dup(struct eb32_node *eb32) +{ + return eb32_entry(eb_prev_dup(&eb32->node), struct eb32_node, node); +} + +/* Return next node in the tree, skipping duplicates, or NULL if none */ +static inline struct eb32_node *eb32_next_unique(struct eb32_node *eb32) +{ + return eb32_entry(eb_next_unique(&eb32->node), struct eb32_node, node); +} + +/* Return previous node in the tree, skipping duplicates, or NULL if none */ +static inline struct eb32_node *eb32_prev_unique(struct eb32_node *eb32) +{ + return eb32_entry(eb_prev_unique(&eb32->node), struct eb32_node, node); +} + +/* Delete node from the tree if it was linked in. Mark the node unused. Note + * that this function relies on a non-inlined generic function: eb_delete. + */ +static inline void eb32_delete(struct eb32_node *eb32) +{ + eb_delete(&eb32->node); +} + +/* + * The following functions are not inlined by default. They are declared + * in eb32tree.c, which simply relies on their inline version. + */ +struct eb32_node *eb32_lookup(struct eb_root *root, u32 x); +struct eb32_node *eb32i_lookup(struct eb_root *root, s32 x); +struct eb32_node *eb32_lookup_le(struct eb_root *root, u32 x); +struct eb32_node *eb32_lookup_ge(struct eb_root *root, u32 x); +struct eb32_node *eb32_insert(struct eb_root *root, struct eb32_node *new); +struct eb32_node *eb32i_insert(struct eb_root *root, struct eb32_node *new); + +/* + * The following functions are less likely to be used directly, because their + * code is larger. The non-inlined version is preferred. + */ + +/* Delete node from the tree if it was linked in. Mark the node unused. */ +static forceinline void __eb32_delete(struct eb32_node *eb32) +{ + __eb_delete(&eb32->node); +} + +/* + * Find the first occurrence of a key in the tree <root>. If none can be + * found, return NULL. 
+ */
+static forceinline struct eb32_node *__eb32_lookup(struct eb_root *root, u32 x)
+{
+ struct eb32_node *node;
+ eb_troot_t *troot;
+ u32 y;
+ int node_bit;
+
+ troot = root->b[EB_LEFT];
+ if (unlikely(troot == NULL))
+ return NULL;
+
+ while (1) {
+ if ((eb_gettag(troot) == EB_LEAF)) {
+ node = container_of(eb_untag(troot, EB_LEAF),
+ struct eb32_node, node.branches);
+ if (node->key == x)
+ return node;
+ else
+ return NULL;
+ }
+ node = container_of(eb_untag(troot, EB_NODE),
+ struct eb32_node, node.branches);
+ node_bit = node->node.bit;
+
+ y = node->key ^ x;
+ if (!y) {
+ /* Either we found the node which holds the key, or
+ * we have a dup tree. In the latter case, we have to
+ * walk it down left to get the first entry.
+ */
+ if (node_bit < 0) {
+ troot = node->node.branches.b[EB_LEFT];
+ while (eb_gettag(troot) != EB_LEAF)
+ troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT];
+ node = container_of(eb_untag(troot, EB_LEAF),
+ struct eb32_node, node.branches);
+ }
+ return node;
+ }
+
+ if ((y >> node_bit) >= EB_NODE_BRANCHES)
+ return NULL; /* no more common bits */
+
+ troot = node->node.branches.b[(x >> node_bit) & EB_NODE_BRANCH_MASK];
+ }
+}
+
+/*
+ * Find the first occurrence of a signed key in the tree <root>. If none can
+ * be found, return NULL.
+ */
+static forceinline struct eb32_node *__eb32i_lookup(struct eb_root *root, s32 x)
+{
+ struct eb32_node *node;
+ eb_troot_t *troot;
+ u32 key = x ^ 0x80000000;
+ u32 y;
+ int node_bit;
+
+ troot = root->b[EB_LEFT];
+ if (unlikely(troot == NULL))
+ return NULL;
+
+ while (1) {
+ if ((eb_gettag(troot) == EB_LEAF)) {
+ node = container_of(eb_untag(troot, EB_LEAF),
+ struct eb32_node, node.branches);
+ if (node->key == (u32)x)
+ return node;
+ else
+ return NULL;
+ }
+ node = container_of(eb_untag(troot, EB_NODE),
+ struct eb32_node, node.branches);
+ node_bit = node->node.bit;
+
+ y = node->key ^ x;
+ if (!y) {
+ /* Either we found the node which holds the key, or
+ * we have a dup tree. In the latter case, we have to
+ * walk it down left to get the first entry.
+ */
+ if (node_bit < 0) {
+ troot = node->node.branches.b[EB_LEFT];
+ while (eb_gettag(troot) != EB_LEAF)
+ troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT];
+ node = container_of(eb_untag(troot, EB_LEAF),
+ struct eb32_node, node.branches);
+ }
+ return node;
+ }
+
+ if ((y >> node_bit) >= EB_NODE_BRANCHES)
+ return NULL; /* no more common bits */
+
+ troot = node->node.branches.b[(key >> node_bit) & EB_NODE_BRANCH_MASK];
+ }
+}
+
+/* Insert eb32_node <new> into subtree starting at node root <root>.
+ * Only new->key needs to be set with the key. The eb32_node is returned.
+ * If root->b[EB_RGHT]==1, the tree may only contain unique keys.
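+ * A short editorial sketch of this unique-key behaviour follows.
+ */
+
+/* EXAMPLE (editorial): on a unique-keyed root, inserting an already present
+ * key returns the existing node and leaves the new one unlinked.
+ * EB_ROOT_UNIQUE is assumed to come from ebtree.h; the rest is invented and
+ * the block is disabled with "#if 0".
+ */
+#if 0
+static void demo_unique(void)
+{
+	struct eb_root root = EB_ROOT_UNIQUE; /* root.b[EB_RGHT] == 1 */
+	struct eb32_node a = { }, b = { };
+	struct eb32_node *ret;
+
+	a.key = 7;
+	b.key = 7;
+	eb32_insert(&root, &a);
+	ret = eb32_insert(&root, &b); /* returns &a, <b> is not linked */
+}
+#endif
+
+/* (the insertion function follows)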
+ */ +static forceinline struct eb32_node * +__eb32_insert(struct eb_root *root, struct eb32_node *new) { + struct eb32_node *old; + unsigned int side; + eb_troot_t *troot, **up_ptr; + u32 newkey; /* caching the key saves approximately one cycle */ + eb_troot_t *root_right; + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf; + int old_node_bit; + + side = EB_LEFT; + troot = root->b[EB_LEFT]; + root_right = root->b[EB_RGHT]; + if (unlikely(troot == NULL)) { + /* Tree is empty, insert the leaf part below the left branch */ + root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF); + new->node.leaf_p = eb_dotag(root, EB_LEFT); + new->node.node_p = NULL; /* node part unused */ + return new; + } + + /* The tree descent is fairly easy : + * - first, check if we have reached a leaf node + * - second, check if we have gone too far + * - third, reiterate + * Everywhere, we use <new> for the node node we are inserting, <root> + * for the node we attach it to, and <old> for the node we are + * displacing below <new>. <troot> will always point to the future node + * (tagged with its type). <side> carries the side the node <new> is + * attached to below its parent, which is also where previous node + * was attached. <newkey> carries the key being inserted. + */ + newkey = new->key; + + while (1) { + if (eb_gettag(troot) == EB_LEAF) { + /* insert above a leaf */ + old = container_of(eb_untag(troot, EB_LEAF), + struct eb32_node, node.branches); + new->node.node_p = old->node.leaf_p; + up_ptr = &old->node.leaf_p; + break; + } + + /* OK we're walking down this link */ + old = container_of(eb_untag(troot, EB_NODE), + struct eb32_node, node.branches); + old_node_bit = old->node.bit; + + /* Stop going down when we don't have common bits anymore. We + * also stop in front of a duplicates tree because it means we + * have to insert above. + */ + + if ((old_node_bit < 0) || /* we're above a duplicate tree, stop here */ + (((new->key ^ old->key) >> old_node_bit) >= EB_NODE_BRANCHES)) { + /* The tree did not contain the key, so we insert <new> before the node + * <old>, and set ->bit to designate the lowest bit position in <new> + * which applies to ->branches.b[]. + */ + new->node.node_p = old->node.node_p; + up_ptr = &old->node.node_p; + break; + } + + /* walk down */ + root = &old->node.branches; + side = (newkey >> old_node_bit) & EB_NODE_BRANCH_MASK; + troot = root->b[side]; + } + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + + /* We need the common higher bits between new->key and old->key. + * What differences are there between new->key and the node here ? + * NOTE that bit(new) is always < bit(root) because highest + * bit of new->key and old->key are identical here (otherwise they + * would sit on different branches). + */ + + // note that if EB_NODE_BITS > 1, we should check that it's still >= 0 + new->node.bit = flsnz(new->key ^ old->key) - EB_NODE_BITS; + + if (new->key == old->key) { + new->node.bit = -1; /* mark as new dup tree, just in case */ + + if (likely(eb_gettag(root_right))) { + /* we refuse to duplicate this key if the tree is + * tagged as containing only unique keys. 
+ */ + return old; + } + + if (eb_gettag(troot) != EB_LEAF) { + /* there was already a dup tree below */ + struct eb_node *ret; + ret = eb_insert_dup(&old->node, &new->node); + return container_of(ret, struct eb32_node, node); + } + /* otherwise fall through */ + } + + if (new->key >= old->key) { + new->node.branches.b[EB_LEFT] = troot; + new->node.branches.b[EB_RGHT] = new_leaf; + new->node.leaf_p = new_rght; + *up_ptr = new_left; + } + else { + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = troot; + new->node.leaf_p = new_left; + *up_ptr = new_rght; + } + + /* Ok, now we are inserting <new> between <root> and <old>. <old>'s + * parent is already set to <new>, and the <root>'s branch is still in + * <side>. Update the root's leaf till we have it. Note that we can also + * find the side by checking the side of new->node.node_p. + */ + + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; +} + +/* Insert eb32_node <new> into subtree starting at node root <root>, using + * signed keys. Only new->key needs be set with the key. The eb32_node + * is returned. If root->b[EB_RGHT]==1, the tree may only contain unique keys. + */ +static forceinline struct eb32_node * +__eb32i_insert(struct eb_root *root, struct eb32_node *new) { + struct eb32_node *old; + unsigned int side; + eb_troot_t *troot, **up_ptr; + int newkey; /* caching the key saves approximately one cycle */ + eb_troot_t *root_right; + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf; + int old_node_bit; + + side = EB_LEFT; + troot = root->b[EB_LEFT]; + root_right = root->b[EB_RGHT]; + if (unlikely(troot == NULL)) { + /* Tree is empty, insert the leaf part below the left branch */ + root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF); + new->node.leaf_p = eb_dotag(root, EB_LEFT); + new->node.node_p = NULL; /* node part unused */ + return new; + } + + /* The tree descent is fairly easy : + * - first, check if we have reached a leaf node + * - second, check if we have gone too far + * - third, reiterate + * Everywhere, we use <new> for the node node we are inserting, <root> + * for the node we attach it to, and <old> for the node we are + * displacing below <new>. <troot> will always point to the future node + * (tagged with its type). <side> carries the side the node <new> is + * attached to below its parent, which is also where previous node + * was attached. <newkey> carries a high bit shift of the key being + * inserted in order to have negative keys stored before positive + * ones. + */ + newkey = new->key + 0x80000000; + + while (1) { + if (eb_gettag(troot) == EB_LEAF) { + old = container_of(eb_untag(troot, EB_LEAF), + struct eb32_node, node.branches); + new->node.node_p = old->node.leaf_p; + up_ptr = &old->node.leaf_p; + break; + } + + /* OK we're walking down this link */ + old = container_of(eb_untag(troot, EB_NODE), + struct eb32_node, node.branches); + old_node_bit = old->node.bit; + + /* Stop going down when we don't have common bits anymore. We + * also stop in front of a duplicates tree because it means we + * have to insert above. + */ + + if ((old_node_bit < 0) || /* we're above a duplicate tree, stop here */ + (((new->key ^ old->key) >> old_node_bit) >= EB_NODE_BRANCHES)) { + /* The tree did not contain the key, so we insert <new> before the node + * <old>, and set ->bit to designate the lowest bit position in <new> + * which applies to ->branches.b[]. 
+ */ + new->node.node_p = old->node.node_p; + up_ptr = &old->node.node_p; + break; + } + + /* walk down */ + root = &old->node.branches; + side = (newkey >> old_node_bit) & EB_NODE_BRANCH_MASK; + troot = root->b[side]; + } + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + + /* We need the common higher bits between new->key and old->key. + * What differences are there between new->key and the node here ? + * NOTE that bit(new) is always < bit(root) because highest + * bit of new->key and old->key are identical here (otherwise they + * would sit on different branches). + */ + + // note that if EB_NODE_BITS > 1, we should check that it's still >= 0 + new->node.bit = flsnz(new->key ^ old->key) - EB_NODE_BITS; + + if (new->key == old->key) { + new->node.bit = -1; /* mark as new dup tree, just in case */ + + if (likely(eb_gettag(root_right))) { + /* we refuse to duplicate this key if the tree is + * tagged as containing only unique keys. + */ + return old; + } + + if (eb_gettag(troot) != EB_LEAF) { + /* there was already a dup tree below */ + struct eb_node *ret; + ret = eb_insert_dup(&old->node, &new->node); + return container_of(ret, struct eb32_node, node); + } + /* otherwise fall through */ + } + + if ((s32)new->key >= (s32)old->key) { + new->node.branches.b[EB_LEFT] = troot; + new->node.branches.b[EB_RGHT] = new_leaf; + new->node.leaf_p = new_rght; + *up_ptr = new_left; + } + else { + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = troot; + new->node.leaf_p = new_left; + *up_ptr = new_rght; + } + + /* Ok, now we are inserting <new> between <root> and <old>. <old>'s + * parent is already set to <new>, and the <root>'s branch is still in + * <side>. Update the root's leaf till we have it. Note that we can also + * find the side by checking the side of new->node.node_p. + */ + + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; +} + +#endif /* _EB32_TREE_H */ diff --git a/include/import/eb64tree.h b/include/import/eb64tree.h new file mode 100644 index 0000000..d6e5db4 --- /dev/null +++ b/include/import/eb64tree.h @@ -0,0 +1,575 @@ +/* + * Elastic Binary Trees - macros and structures for operations on 64bit nodes. + * Version 6.0.6 + * (C) 2002-2011 - Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _EB64TREE_H +#define _EB64TREE_H + +#include "ebtree.h" + + +/* Return the structure of type <type> whose member <member> points to <ptr> */ +#define eb64_entry(ptr, type, member) container_of(ptr, type, member) + +/* + * Exported functions and macros. + * Many of them are always inlined because they are extremely small, and + * are generally called at most once or twice in a program. 
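+ * An editorial usage sketch is interleaved right below.
+ */
+
+/* EXAMPLE (editorial, not part of the original header): 64-bit keys are a
+ * natural fit for expiry dates, and eb64_lookup_ge() returns the first
+ * entry expiring at or after a given date. All names are invented and the
+ * block is disabled with "#if 0".
+ */
+#if 0
+struct timer {
+	struct eb64_node node; /* node.key holds the expiry date */
+};
+
+static struct timer *next_expiry(struct eb_root *wheel, u64 now)
+{
+	struct eb64_node *n = eb64_lookup_ge(wheel, now);
+
+	return n ? eb64_entry(n, struct timer, node) : NULL;
+}
+#endif
+
+/* (the inline accessors follow)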
+ */ + +/* Return leftmost node in the tree, or NULL if none */ +static inline struct eb64_node *eb64_first(struct eb_root *root) +{ + return eb64_entry(eb_first(root), struct eb64_node, node); +} + +/* Return rightmost node in the tree, or NULL if none */ +static inline struct eb64_node *eb64_last(struct eb_root *root) +{ + return eb64_entry(eb_last(root), struct eb64_node, node); +} + +/* Return next node in the tree, or NULL if none */ +static inline struct eb64_node *eb64_next(struct eb64_node *eb64) +{ + return eb64_entry(eb_next(&eb64->node), struct eb64_node, node); +} + +/* Return previous node in the tree, or NULL if none */ +static inline struct eb64_node *eb64_prev(struct eb64_node *eb64) +{ + return eb64_entry(eb_prev(&eb64->node), struct eb64_node, node); +} + +/* Return next leaf node within a duplicate sub-tree, or NULL if none. */ +static inline struct eb64_node *eb64_next_dup(struct eb64_node *eb64) +{ + return eb64_entry(eb_next_dup(&eb64->node), struct eb64_node, node); +} + +/* Return previous leaf node within a duplicate sub-tree, or NULL if none. */ +static inline struct eb64_node *eb64_prev_dup(struct eb64_node *eb64) +{ + return eb64_entry(eb_prev_dup(&eb64->node), struct eb64_node, node); +} + +/* Return next node in the tree, skipping duplicates, or NULL if none */ +static inline struct eb64_node *eb64_next_unique(struct eb64_node *eb64) +{ + return eb64_entry(eb_next_unique(&eb64->node), struct eb64_node, node); +} + +/* Return previous node in the tree, skipping duplicates, or NULL if none */ +static inline struct eb64_node *eb64_prev_unique(struct eb64_node *eb64) +{ + return eb64_entry(eb_prev_unique(&eb64->node), struct eb64_node, node); +} + +/* Delete node from the tree if it was linked in. Mark the node unused. Note + * that this function relies on a non-inlined generic function: eb_delete. + */ +static inline void eb64_delete(struct eb64_node *eb64) +{ + eb_delete(&eb64->node); +} + +/* + * The following functions are not inlined by default. They are declared + * in eb64tree.c, which simply relies on their inline version. + */ +struct eb64_node *eb64_lookup(struct eb_root *root, u64 x); +struct eb64_node *eb64i_lookup(struct eb_root *root, s64 x); +struct eb64_node *eb64_lookup_le(struct eb_root *root, u64 x); +struct eb64_node *eb64_lookup_ge(struct eb_root *root, u64 x); +struct eb64_node *eb64_insert(struct eb_root *root, struct eb64_node *new); +struct eb64_node *eb64i_insert(struct eb_root *root, struct eb64_node *new); + +/* + * The following functions are less likely to be used directly, because their + * code is larger. The non-inlined version is preferred. + */ + +/* Delete node from the tree if it was linked in. Mark the node unused. */ +static forceinline void __eb64_delete(struct eb64_node *eb64) +{ + __eb_delete(&eb64->node); +} + +/* + * Find the first occurrence of a key in the tree <root>. If none can be + * found, return NULL. + */ +static forceinline struct eb64_node *__eb64_lookup(struct eb_root *root, u64 x) +{ + struct eb64_node *node; + eb_troot_t *troot; + u64 y; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + node = container_of(eb_untag(troot, EB_LEAF), + struct eb64_node, node.branches); + if (node->key == x) + return node; + else + return NULL; + } + node = container_of(eb_untag(troot, EB_NODE), + struct eb64_node, node.branches); + + y = node->key ^ x; + if (!y) { + /* Either we found the node which holds the key, or + * we have a dup tree. 
In the latter case, we have to
+ * walk it down left to get the first entry.
+ */
+ if (node->node.bit < 0) {
+ troot = node->node.branches.b[EB_LEFT];
+ while (eb_gettag(troot) != EB_LEAF)
+ troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT];
+ node = container_of(eb_untag(troot, EB_LEAF),
+ struct eb64_node, node.branches);
+ }
+ return node;
+ }
+
+ if ((y >> node->node.bit) >= EB_NODE_BRANCHES)
+ return NULL; /* no more common bits */
+
+ troot = node->node.branches.b[(x >> node->node.bit) & EB_NODE_BRANCH_MASK];
+ }
+}
+
+/*
+ * Find the first occurrence of a signed key in the tree <root>. If none can
+ * be found, return NULL.
+ */
+static forceinline struct eb64_node *__eb64i_lookup(struct eb_root *root, s64 x)
+{
+ struct eb64_node *node;
+ eb_troot_t *troot;
+ u64 key = x ^ (1ULL << 63);
+ u64 y;
+
+ troot = root->b[EB_LEFT];
+ if (unlikely(troot == NULL))
+ return NULL;
+
+ while (1) {
+ if ((eb_gettag(troot) == EB_LEAF)) {
+ node = container_of(eb_untag(troot, EB_LEAF),
+ struct eb64_node, node.branches);
+ if (node->key == (u64)x)
+ return node;
+ else
+ return NULL;
+ }
+ node = container_of(eb_untag(troot, EB_NODE),
+ struct eb64_node, node.branches);
+
+ y = node->key ^ x;
+ if (!y) {
+ /* Either we found the node which holds the key, or
+ * we have a dup tree. In the latter case, we have to
+ * walk it down left to get the first entry.
+ */
+ if (node->node.bit < 0) {
+ troot = node->node.branches.b[EB_LEFT];
+ while (eb_gettag(troot) != EB_LEAF)
+ troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT];
+ node = container_of(eb_untag(troot, EB_LEAF),
+ struct eb64_node, node.branches);
+ }
+ return node;
+ }
+
+ if ((y >> node->node.bit) >= EB_NODE_BRANCHES)
+ return NULL; /* no more common bits */
+
+ troot = node->node.branches.b[(key >> node->node.bit) & EB_NODE_BRANCH_MASK];
+ }
+}
+
+/* Insert eb64_node <new> into subtree starting at node root <root>.
+ * Only new->key needs to be set with the key. The eb64_node is returned.
+ * If root->b[EB_RGHT]==1, the tree may only contain unique keys.
+ */
+static forceinline struct eb64_node *
+__eb64_insert(struct eb_root *root, struct eb64_node *new) {
+ struct eb64_node *old;
+ unsigned int side;
+ eb_troot_t *troot;
+ u64 newkey; /* caching the key saves approximately one cycle */
+ eb_troot_t *root_right;
+ int old_node_bit;
+
+ side = EB_LEFT;
+ troot = root->b[EB_LEFT];
+ root_right = root->b[EB_RGHT];
+ if (unlikely(troot == NULL)) {
+ /* Tree is empty, insert the leaf part below the left branch */
+ root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF);
+ new->node.leaf_p = eb_dotag(root, EB_LEFT);
+ new->node.node_p = NULL; /* node part unused */
+ return new;
+ }
+
+ /* The tree descent is fairly easy :
+ * - first, check if we have reached a leaf node
+ * - second, check if we have gone too far
+ * - third, reiterate
+ * Everywhere, we use <new> for the node we are inserting, <root>
+ * for the node we attach it to, and <old> for the node we are
+ * displacing below <new>. <troot> will always point to the future node
+ * (tagged with its type). <side> carries the side the node <new> is
+ * attached to below its parent, which is also where previous node
+ * was attached. <newkey> carries the key being inserted.
+ */ + newkey = new->key; + + while (1) { + if (unlikely(eb_gettag(troot) == EB_LEAF)) { + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_leaf; + + old = container_of(eb_untag(troot, EB_LEAF), + struct eb64_node, node.branches); + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_leaf = eb_dotag(&old->node.branches, EB_LEAF); + + new->node.node_p = old->node.leaf_p; + + /* Right here, we have 3 possibilities : + - the tree does not contain the key, and we have + new->key < old->key. We insert new above old, on + the left ; + + - the tree does not contain the key, and we have + new->key > old->key. We insert new above old, on + the right ; + + - the tree does contain the key, which implies it + is alone. We add the new key next to it as a + first duplicate. + + The last two cases can easily be partially merged. + */ + + if (new->key < old->key) { + new->node.leaf_p = new_left; + old->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_leaf; + } else { + /* we may refuse to duplicate this key if the tree is + * tagged as containing only unique keys. + */ + if ((new->key == old->key) && eb_gettag(root_right)) + return old; + + /* new->key >= old->key, new goes the right */ + old->node.leaf_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_leaf; + new->node.branches.b[EB_RGHT] = new_leaf; + + if (new->key == old->key) { + new->node.bit = -1; + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; + } + } + break; + } + + /* OK we're walking down this link */ + old = container_of(eb_untag(troot, EB_NODE), + struct eb64_node, node.branches); + old_node_bit = old->node.bit; + + /* Stop going down when we don't have common bits anymore. We + * also stop in front of a duplicates tree because it means we + * have to insert above. + */ + + if ((old_node_bit < 0) || /* we're above a duplicate tree, stop here */ + (((new->key ^ old->key) >> old_node_bit) >= EB_NODE_BRANCHES)) { + /* The tree did not contain the key, so we insert <new> before the node + * <old>, and set ->bit to designate the lowest bit position in <new> + * which applies to ->branches.b[]. + */ + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_node; + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_node = eb_dotag(&old->node.branches, EB_NODE); + + new->node.node_p = old->node.node_p; + + if (new->key < old->key) { + new->node.leaf_p = new_left; + old->node.node_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_node; + } + else if (new->key > old->key) { + old->node.node_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_node; + new->node.branches.b[EB_RGHT] = new_leaf; + } + else { + struct eb_node *ret; + ret = eb_insert_dup(&old->node, &new->node); + return container_of(ret, struct eb64_node, node); + } + break; + } + + /* walk down */ + root = &old->node.branches; + + if (sizeof(long) >= 8) { + side = newkey >> old_node_bit; + } else { + /* note: provides the best code on low-register count archs + * such as i386. 
+ */ + side = newkey; + side >>= old_node_bit; + if (old_node_bit >= 32) { + side = newkey >> 32; + side >>= old_node_bit & 0x1F; + } + } + side &= EB_NODE_BRANCH_MASK; + troot = root->b[side]; + } + + /* Ok, now we are inserting <new> between <root> and <old>. <old>'s + * parent is already set to <new>, and the <root>'s branch is still in + * <side>. Update the root's leaf till we have it. Note that we can also + * find the side by checking the side of new->node.node_p. + */ + + /* We need the common higher bits between new->key and old->key. + * What differences are there between new->key and the node here ? + * NOTE that bit(new) is always < bit(root) because highest + * bit of new->key and old->key are identical here (otherwise they + * would sit on different branches). + */ + // note that if EB_NODE_BITS > 1, we should check that it's still >= 0 + new->node.bit = fls64(new->key ^ old->key) - EB_NODE_BITS; + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + + return new; +} + +/* Insert eb64_node <new> into subtree starting at node root <root>, using + * signed keys. Only new->key needs be set with the key. The eb64_node + * is returned. If root->b[EB_RGHT]==1, the tree may only contain unique keys. + */ +static forceinline struct eb64_node * +__eb64i_insert(struct eb_root *root, struct eb64_node *new) { + struct eb64_node *old; + unsigned int side; + eb_troot_t *troot; + u64 newkey; /* caching the key saves approximately one cycle */ + eb_troot_t *root_right; + int old_node_bit; + + side = EB_LEFT; + troot = root->b[EB_LEFT]; + root_right = root->b[EB_RGHT]; + if (unlikely(troot == NULL)) { + /* Tree is empty, insert the leaf part below the left branch */ + root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF); + new->node.leaf_p = eb_dotag(root, EB_LEFT); + new->node.node_p = NULL; /* node part unused */ + return new; + } + + /* The tree descent is fairly easy : + * - first, check if we have reached a leaf node + * - second, check if we have gone too far + * - third, reiterate + * Everywhere, we use <new> for the node node we are inserting, <root> + * for the node we attach it to, and <old> for the node we are + * displacing below <new>. <troot> will always point to the future node + * (tagged with its type). <side> carries the side the node <new> is + * attached to below its parent, which is also where previous node + * was attached. <newkey> carries a high bit shift of the key being + * inserted in order to have negative keys stored before positive + * ones. + */ + newkey = new->key ^ (1ULL << 63); + + while (1) { + if (unlikely(eb_gettag(troot) == EB_LEAF)) { + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_leaf; + + old = container_of(eb_untag(troot, EB_LEAF), + struct eb64_node, node.branches); + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_leaf = eb_dotag(&old->node.branches, EB_LEAF); + + new->node.node_p = old->node.leaf_p; + + /* Right here, we have 3 possibilities : + - the tree does not contain the key, and we have + new->key < old->key. We insert new above old, on + the left ; + + - the tree does not contain the key, and we have + new->key > old->key. We insert new above old, on + the right ; + + - the tree does contain the key, which implies it + is alone. We add the new key next to it as a + first duplicate. + + The last two cases can easily be partially merged. 
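+
+			   (Added illustration, not in the original: the
+			   comparisons below use (s64) casts while the descent
+			   uses newkey = key ^ (1ULL << 63); e.g. key -1 maps
+			   to 0x7fff...ffff and key 1 to 0x8000...0001, so
+			   unsigned branch selection agrees with the signed
+			   ordering -1 < 1.)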
+ */ + + if ((s64)new->key < (s64)old->key) { + new->node.leaf_p = new_left; + old->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_leaf; + } else { + /* we may refuse to duplicate this key if the tree is + * tagged as containing only unique keys. + */ + if ((new->key == old->key) && eb_gettag(root_right)) + return old; + + /* new->key >= old->key, new goes the right */ + old->node.leaf_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_leaf; + new->node.branches.b[EB_RGHT] = new_leaf; + + if (new->key == old->key) { + new->node.bit = -1; + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; + } + } + break; + } + + /* OK we're walking down this link */ + old = container_of(eb_untag(troot, EB_NODE), + struct eb64_node, node.branches); + old_node_bit = old->node.bit; + + /* Stop going down when we don't have common bits anymore. We + * also stop in front of a duplicates tree because it means we + * have to insert above. + */ + + if ((old_node_bit < 0) || /* we're above a duplicate tree, stop here */ + (((new->key ^ old->key) >> old_node_bit) >= EB_NODE_BRANCHES)) { + /* The tree did not contain the key, so we insert <new> before the node + * <old>, and set ->bit to designate the lowest bit position in <new> + * which applies to ->branches.b[]. + */ + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_node; + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_node = eb_dotag(&old->node.branches, EB_NODE); + + new->node.node_p = old->node.node_p; + + if ((s64)new->key < (s64)old->key) { + new->node.leaf_p = new_left; + old->node.node_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_node; + } + else if ((s64)new->key > (s64)old->key) { + old->node.node_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_node; + new->node.branches.b[EB_RGHT] = new_leaf; + } + else { + struct eb_node *ret; + ret = eb_insert_dup(&old->node, &new->node); + return container_of(ret, struct eb64_node, node); + } + break; + } + + /* walk down */ + root = &old->node.branches; + + if (sizeof(long) >= 8) { + side = newkey >> old_node_bit; + } else { + /* note: provides the best code on low-register count archs + * such as i386. + */ + side = newkey; + side >>= old_node_bit; + if (old_node_bit >= 32) { + side = newkey >> 32; + side >>= old_node_bit & 0x1F; + } + } + side &= EB_NODE_BRANCH_MASK; + troot = root->b[side]; + } + + /* Ok, now we are inserting <new> between <root> and <old>. <old>'s + * parent is already set to <new>, and the <root>'s branch is still in + * <side>. Update the root's leaf till we have it. Note that we can also + * find the side by checking the side of new->node.node_p. + */ + + /* We need the common higher bits between new->key and old->key. + * What differences are there between new->key and the node here ? + * NOTE that bit(new) is always < bit(root) because highest + * bit of new->key and old->key are identical here (otherwise they + * would sit on different branches). 
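+	 *
+	 * (Added worked example, not in the original: for new->key == 0x12
+	 * and old->key == 0x10, the XOR is 0x02, fls64() returns 2, and with
+	 * EB_NODE_BITS == 1 the new node gets bit 1: both keys agree on all
+	 * bits above bit 1 and split there. This assumes fls64() returns the
+	 * 1-based position of the highest set bit, as elsewhere in this
+	 * library.)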
+	 */
+	// note that if EB_NODE_BITS > 1, we should check that it's still >= 0
+	new->node.bit = fls64(new->key ^ old->key) - EB_NODE_BITS;
+	root->b[side] = eb_dotag(&new->node.branches, EB_NODE);
+
+	return new;
+}
+
+#endif /* _EB64_TREE_H */
diff --git a/include/import/ebimtree.h b/include/import/ebimtree.h
new file mode 100644
index 0000000..0afbdd1
--- /dev/null
+++ b/include/import/ebimtree.h
@@ -0,0 +1,324 @@
+/*
+ * Elastic Binary Trees - macros for Indirect Multi-Byte data nodes.
+ * Version 6.0.6
+ * (C) 2002-2011 - Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _EBIMTREE_H
+#define _EBIMTREE_H
+
+#include <string.h>
+#include "ebtree.h"
+#include "ebpttree.h"
+
+/* These functions and macros rely on Pointer nodes and use the <key> entry as
+ * a pointer to an indirect key. Most operations are performed using ebpt_*.
+ */
+
+/* The following functions are not inlined by default. They are declared
+ * in ebimtree.c, which simply relies on their inline version.
+ */
+struct ebpt_node *ebim_lookup(struct eb_root *root, const void *x, unsigned int len);
+struct ebpt_node *ebim_insert(struct eb_root *root, struct ebpt_node *new, unsigned int len);
+
+/* Find the first occurrence of a key of at least <len> bytes matching <x> in the
+ * tree <root>. The caller is responsible for ensuring that <len> will not exceed
+ * the common parts between the tree's keys and <x>. In case of multiple matches,
+ * the leftmost node is returned. This means that this function can be used to
+ * lookup string keys by prefix if all keys in the tree are zero-terminated. If
+ * no match is found, NULL is returned. Returns first node if <len> is zero.
+ */
+static forceinline struct ebpt_node *
+__ebim_lookup(struct eb_root *root, const void *x, unsigned int len)
+{
+	struct ebpt_node *node;
+	eb_troot_t *troot;
+	int pos, side;
+	int node_bit;
+
+	troot = root->b[EB_LEFT];
+	if (unlikely(troot == NULL))
+		goto ret_null;
+
+	if (unlikely(len == 0))
+		goto walk_down;
+
+	pos = 0;
+	while (1) {
+		if (eb_gettag(troot) == EB_LEAF) {
+			node = container_of(eb_untag(troot, EB_LEAF),
+					    struct ebpt_node, node.branches);
+			if (eb_memcmp(node->key + pos, x, len) != 0)
+				goto ret_null;
+			else
+				goto ret_node;
+		}
+		node = container_of(eb_untag(troot, EB_NODE),
+				    struct ebpt_node, node.branches);
+
+		node_bit = node->node.bit;
+		if (node_bit < 0) {
+			/* We have a dup tree now. Either it's for the same
+			 * value, and we walk down left, or it's a different
+			 * one and we don't have our key.
+			 */
+			if (eb_memcmp(node->key + pos, x, len) != 0)
+				goto ret_null;
+			else
+				goto walk_left;
+		}
+
+		/* OK, normal data node, let's walk down. We check if all full
+		 * bytes are equal, and we start from the last one we did not
+		 * completely check. We stop as soon as we reach the last byte,
+		 * because we must decide to go left/right or abort.
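+		 *
+		 * (Added worked example, not in the original: node->node.bit
+		 * numbers bits from the MSB of the key, and the rewrite below
+		 * turns it into (pos << 3) + (7 - node_bit). E.g. with
+		 * node->node.bit == 13 and pos == 0 it yields -6, so the byte
+		 * loop compares byte 0 and leaves node_bit == 2, meaning the
+		 * split is at bit 5 (from the MSB) of byte 1, read by
+		 * shifting that byte right by 2.)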
+ */ + node_bit = ~node_bit + (pos << 3) + 8; // = (pos<<3) + (7 - node_bit) + if (node_bit < 0) { + /* This surprising construction gives better performance + * because gcc does not try to reorder the loop. Tested to + * be fine with 2.95 to 4.2. + */ + while (1) { + if (*(unsigned char*)(node->key + pos++) ^ *(unsigned char*)(x++)) + goto ret_null; /* more than one full byte is different */ + if (--len == 0) + goto walk_left; /* return first node if all bytes matched */ + node_bit += 8; + if (node_bit >= 0) + break; + } + } + + /* here we know that only the last byte differs, so node_bit < 8. + * We have 2 possibilities : + * - more than the last bit differs => return NULL + * - walk down on side = (x[pos] >> node_bit) & 1 + */ + side = *(unsigned char *)x >> node_bit; + if (((*(unsigned char*)(node->key + pos) >> node_bit) ^ side) > 1) + goto ret_null; + side &= 1; + troot = node->node.branches.b[side]; + } + walk_left: + troot = node->node.branches.b[EB_LEFT]; + walk_down: + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT]; + node = container_of(eb_untag(troot, EB_LEAF), + struct ebpt_node, node.branches); + ret_node: + return node; + ret_null: + return NULL; +} + +/* Insert ebpt_node <new> into subtree starting at node root <root>. + * Only new->key needs be set with the key. The ebpt_node is returned. + * If root->b[EB_RGHT]==1, the tree may only contain unique keys. The + * len is specified in bytes. + */ +static forceinline struct ebpt_node * +__ebim_insert(struct eb_root *root, struct ebpt_node *new, unsigned int len) +{ + struct ebpt_node *old; + unsigned int side; + eb_troot_t *troot; + eb_troot_t *root_right; + int diff; + int bit; + int old_node_bit; + + side = EB_LEFT; + troot = root->b[EB_LEFT]; + root_right = root->b[EB_RGHT]; + if (unlikely(troot == NULL)) { + /* Tree is empty, insert the leaf part below the left branch */ + root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF); + new->node.leaf_p = eb_dotag(root, EB_LEFT); + new->node.node_p = NULL; /* node part unused */ + return new; + } + + len <<= 3; + + /* The tree descent is fairly easy : + * - first, check if we have reached a leaf node + * - second, check if we have gone too far + * - third, reiterate + * Everywhere, we use <new> for the node node we are inserting, <root> + * for the node we attach it to, and <old> for the node we are + * displacing below <new>. <troot> will always point to the future node + * (tagged with its type). <side> carries the side the node <new> is + * attached to below its parent, which is also where previous node + * was attached. + */ + + bit = 0; + while (1) { + if (unlikely(eb_gettag(troot) == EB_LEAF)) { + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_leaf; + + old = container_of(eb_untag(troot, EB_LEAF), + struct ebpt_node, node.branches); + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_leaf = eb_dotag(&old->node.branches, EB_LEAF); + + new->node.node_p = old->node.leaf_p; + + /* Right here, we have 3 possibilities : + * - the tree does not contain the key, and we have + * new->key < old->key. We insert new above old, on + * the left ; + * + * - the tree does not contain the key, and we have + * new->key > old->key. We insert new above old, on + * the right ; + * + * - the tree does contain the key, which implies it + * is alone. We add the new key next to it as a + * first duplicate. 
+ * + * The last two cases can easily be partially merged. + */ + bit = equal_bits(new->key, old->key, bit, len); + + /* Note: we can compare more bits than the current node's because as + * long as they are identical, we know we descend along the correct + * side. However we don't want to start to compare past the end. + */ + diff = 0; + if (((unsigned)bit >> 3) < len) + diff = cmp_bits(new->key, old->key, bit); + + if (diff < 0) { + new->node.leaf_p = new_left; + old->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_leaf; + } else { + /* we may refuse to duplicate this key if the tree is + * tagged as containing only unique keys. + */ + if (diff == 0 && eb_gettag(root_right)) + return old; + + /* new->key >= old->key, new goes the right */ + old->node.leaf_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_leaf; + new->node.branches.b[EB_RGHT] = new_leaf; + + if (diff == 0) { + new->node.bit = -1; + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; + } + } + break; + } + + /* OK we're walking down this link */ + old = container_of(eb_untag(troot, EB_NODE), + struct ebpt_node, node.branches); + old_node_bit = old->node.bit; + + /* Stop going down when we don't have common bits anymore. We + * also stop in front of a duplicates tree because it means we + * have to insert above. Note: we can compare more bits than + * the current node's because as long as they are identical, we + * know we descend along the correct side. + */ + if (old_node_bit < 0) { + /* we're above a duplicate tree, we must compare till the end */ + bit = equal_bits(new->key, old->key, bit, len); + goto dup_tree; + } + else if (bit < old_node_bit) { + bit = equal_bits(new->key, old->key, bit, old_node_bit); + } + + if (bit < old_node_bit) { /* we don't have all bits in common */ + /* The tree did not contain the key, so we insert <new> before the node + * <old>, and set ->bit to designate the lowest bit position in <new> + * which applies to ->branches.b[]. + */ + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_node; + + dup_tree: + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_node = eb_dotag(&old->node.branches, EB_NODE); + + new->node.node_p = old->node.node_p; + + /* Note: we can compare more bits than the current node's because as + * long as they are identical, we know we descend along the correct + * side. However we don't want to start to compare past the end. + */ + diff = 0; + if (((unsigned)bit >> 3) < len) + diff = cmp_bits(new->key, old->key, bit); + + if (diff < 0) { + new->node.leaf_p = new_left; + old->node.node_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_node; + } + else if (diff > 0) { + old->node.node_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_node; + new->node.branches.b[EB_RGHT] = new_leaf; + } + else { + struct eb_node *ret; + ret = eb_insert_dup(&old->node, &new->node); + return container_of(ret, struct ebpt_node, node); + } + break; + } + + /* walk down */ + root = &old->node.branches; + side = (((unsigned char *)new->key)[old_node_bit >> 3] >> (~old_node_bit & 7)) & 1; + troot = root->b[side]; + } + + /* Ok, now we are inserting <new> between <root> and <old>. <old>'s + * parent is already set to <new>, and the <root>'s branch is still in + * <side>. 
Update the root's leaf till we have it. Note that we can also + * find the side by checking the side of new->node.node_p. + */ + + /* We need the common higher bits between new->key and old->key. + * This number of bits is already in <bit>. + */ + new->node.bit = bit; + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; +} + +#endif /* _EBIMTREE_H */ diff --git a/include/import/ebistree.h b/include/import/ebistree.h new file mode 100644 index 0000000..a438fa1 --- /dev/null +++ b/include/import/ebistree.h @@ -0,0 +1,329 @@ +/* + * Elastic Binary Trees - macros to manipulate Indirect String data nodes. + * Version 6.0.6 + * (C) 2002-2011 - Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* These functions and macros rely on Multi-Byte nodes */ + +#ifndef _EBISTREE_H +#define _EBISTREE_H + +#include <string.h> +#include "ebtree.h" +#include "ebpttree.h" +#include "ebimtree.h" + +/* These functions and macros rely on Pointer nodes and use the <key> entry as + * a pointer to an indirect key. Most operations are performed using ebpt_*. + */ + +/* The following functions are not inlined by default. They are declared + * in ebistree.c, which simply relies on their inline version. + */ +struct ebpt_node *ebis_lookup(struct eb_root *root, const char *x); +struct ebpt_node *ebis_insert(struct eb_root *root, struct ebpt_node *new); + +/* Find the first occurrence of a length <len> string <x> in the tree <root>. + * It's the caller's responsibility to use this function only on trees which + * only contain zero-terminated strings, and that no null character is present + * in string <x> in the first <len> chars. If none can be found, return NULL. + */ +static forceinline struct ebpt_node * +ebis_lookup_len(struct eb_root *root, const char *x, unsigned int len) +{ + struct ebpt_node *node; + + node = ebim_lookup(root, x, len); + if (!node || ((const char *)node->key)[len] != 0) + return NULL; + return node; +} + +/* Find the first occurrence of a zero-terminated string <x> in the tree <root>. + * It's the caller's responsibility to use this function only on trees which + * only contain zero-terminated strings. If none can be found, return NULL. + */ +static forceinline struct ebpt_node *__ebis_lookup(struct eb_root *root, const void *x) +{ + struct ebpt_node *node; + eb_troot_t *troot; + int bit; + int node_bit; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + bit = 0; + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + node = container_of(eb_untag(troot, EB_LEAF), + struct ebpt_node, node.branches); + if (strcmp(node->key, x) == 0) + return node; + else + return NULL; + } + node = container_of(eb_untag(troot, EB_NODE), + struct ebpt_node, node.branches); + node_bit = node->node.bit; + + if (node_bit < 0) { + /* We have a dup tree now. 
Either it's for the same + * value, and we walk down left, or it's a different + * one and we don't have our key. + */ + if (strcmp(node->key, x) != 0) + return NULL; + + troot = node->node.branches.b[EB_LEFT]; + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT]; + node = container_of(eb_untag(troot, EB_LEAF), + struct ebpt_node, node.branches); + return node; + } + + /* OK, normal data node, let's walk down but don't compare data + * if we already reached the end of the key. + */ + if (likely(bit >= 0)) { + bit = string_equal_bits(x, node->key, bit); + if (likely(bit < node_bit)) { + if (bit >= 0) + return NULL; /* no more common bits */ + + /* bit < 0 : we reached the end of the key. If we + * are in a tree with unique keys, we can return + * this node. Otherwise we have to walk it down + * and stop comparing bits. + */ + if (eb_gettag(root->b[EB_RGHT])) + return node; + } + /* if the bit is larger than the node's, we must bound it + * because we might have compared too many bytes with an + * inappropriate leaf. For a test, build a tree from "0", + * "WW", "W", "S" inserted in this exact sequence and lookup + * "W" => "S" is returned without this assignment. + */ + else + bit = node_bit; + } + + troot = node->node.branches.b[(((unsigned char*)x)[node_bit >> 3] >> + (~node_bit & 7)) & 1]; + } +} + +/* Insert ebpt_node <new> into subtree starting at node root <root>. Only + * new->key needs be set with the zero-terminated string key. The ebpt_node is + * returned. If root->b[EB_RGHT]==1, the tree may only contain unique keys. The + * caller is responsible for properly terminating the key with a zero. + */ +static forceinline struct ebpt_node * +__ebis_insert(struct eb_root *root, struct ebpt_node *new) +{ + struct ebpt_node *old; + unsigned int side; + eb_troot_t *troot; + eb_troot_t *root_right; + int diff; + int bit; + int old_node_bit; + + side = EB_LEFT; + troot = root->b[EB_LEFT]; + root_right = root->b[EB_RGHT]; + if (unlikely(troot == NULL)) { + /* Tree is empty, insert the leaf part below the left branch */ + root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF); + new->node.leaf_p = eb_dotag(root, EB_LEFT); + new->node.node_p = NULL; /* node part unused */ + return new; + } + + /* The tree descent is fairly easy : + * - first, check if we have reached a leaf node + * - second, check if we have gone too far + * - third, reiterate + * Everywhere, we use <new> for the node node we are inserting, <root> + * for the node we attach it to, and <old> for the node we are + * displacing below <new>. <troot> will always point to the future node + * (tagged with its type). <side> carries the side the node <new> is + * attached to below its parent, which is also where previous node + * was attached. + */ + + bit = 0; + while (1) { + if (unlikely(eb_gettag(troot) == EB_LEAF)) { + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_leaf; + + old = container_of(eb_untag(troot, EB_LEAF), + struct ebpt_node, node.branches); + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_leaf = eb_dotag(&old->node.branches, EB_LEAF); + + new->node.node_p = old->node.leaf_p; + + /* Right here, we have 3 possibilities : + * - the tree does not contain the key, and we have + * new->key < old->key. We insert new above old, on + * the left ; + * + * - the tree does not contain the key, and we have + * new->key > old->key. 
We insert new above old, on
+			 *   the right ;
+			 *
+			 * - the tree does contain the key, which implies it
+			 *   is alone. We add the new key next to it as a
+			 *   first duplicate.
+			 *
+			 * The last two cases can easily be partially merged.
+			 */
+			if (bit >= 0)
+				bit = string_equal_bits(new->key, old->key, bit);
+
+			if (bit < 0) {
+				/* key was already there */
+
+				/* we may refuse to duplicate this key if the tree is
+				 * tagged as containing only unique keys.
+				 */
+				if (eb_gettag(root_right))
+					return old;
+
+				/* new arbitrarily goes to the right and tops the dup tree */
+				old->node.leaf_p = new_left;
+				new->node.leaf_p = new_rght;
+				new->node.branches.b[EB_LEFT] = old_leaf;
+				new->node.branches.b[EB_RGHT] = new_leaf;
+				new->node.bit = -1;
+				root->b[side] = eb_dotag(&new->node.branches, EB_NODE);
+				return new;
+			}
+
+			diff = cmp_bits(new->key, old->key, bit);
+			if (diff < 0) {
+				/* new->key < old->key, new takes the left */
+				new->node.leaf_p = new_left;
+				old->node.leaf_p = new_rght;
+				new->node.branches.b[EB_LEFT] = new_leaf;
+				new->node.branches.b[EB_RGHT] = old_leaf;
+			} else {
+				/* new->key > old->key, new takes the right */
+				old->node.leaf_p = new_left;
+				new->node.leaf_p = new_rght;
+				new->node.branches.b[EB_LEFT] = old_leaf;
+				new->node.branches.b[EB_RGHT] = new_leaf;
+			}
+			break;
+		}
+
+		/* OK we're walking down this link */
+		old = container_of(eb_untag(troot, EB_NODE),
+				   struct ebpt_node, node.branches);
+		old_node_bit = old->node.bit;
+
+		/* Stop going down when we don't have common bits anymore. We
+		 * also stop in front of a duplicates tree because it means we
+		 * have to insert above. Note: we can compare more bits than
+		 * the current node's because as long as they are identical, we
+		 * know we descend along the correct side.
+		 */
+		if (bit >= 0 && (bit < old_node_bit || old_node_bit < 0))
+			bit = string_equal_bits(new->key, old->key, bit);
+
+		if (unlikely(bit < 0)) {
+			/* Perfect match, we must only stop on head of dup tree
+			 * or walk down to a leaf.
+			 */
+			if (old_node_bit < 0) {
+				/* We know here that string_equal_bits matched all
+				 * bits and that we're on top of a dup tree, then
+				 * we can perform the dup insertion and return.
+				 */
+				struct eb_node *ret;
+				ret = eb_insert_dup(&old->node, &new->node);
+				return container_of(ret, struct ebpt_node, node);
+			}
+			/* OK so let's walk down */
+		}
+		else if (bit < old_node_bit || old_node_bit < 0) {
+			/* The tree did not contain the key, or we stopped on top of a dup
+			 * tree, possibly containing the key. In the former case, we insert
+			 * <new> before the node <old>, and set ->bit to designate the lowest
+			 * bit position in <new> which applies to ->branches.b[]. In the latter
+			 * case, we add the key to the existing dup tree. Note that we cannot
+			 * enter here if we match an intermediate node's key that is not the
+			 * head of a dup tree.
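+			 *
+			 * (Added worked example, not in the original: the
+			 * walk-down below reads bit <old_node_bit> counted
+			 * from the MSB of the key: for old_node_bit == 10 it
+			 * picks byte 1 (10 >> 3) and shifts it right by 5,
+			 * since ~10 & 7 == 7 - (10 & 7) == 5.)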
+			 */
+			eb_troot_t *new_left, *new_rght;
+			eb_troot_t *new_leaf, *old_node;
+
+			new_left = eb_dotag(&new->node.branches, EB_LEFT);
+			new_rght = eb_dotag(&new->node.branches, EB_RGHT);
+			new_leaf = eb_dotag(&new->node.branches, EB_LEAF);
+			old_node = eb_dotag(&old->node.branches, EB_NODE);
+
+			new->node.node_p = old->node.node_p;
+
+			/* we can never match all bits here */
+			diff = cmp_bits(new->key, old->key, bit);
+			if (diff < 0) {
+				new->node.leaf_p = new_left;
+				old->node.node_p = new_rght;
+				new->node.branches.b[EB_LEFT] = new_leaf;
+				new->node.branches.b[EB_RGHT] = old_node;
+			}
+			else {
+				old->node.node_p = new_left;
+				new->node.leaf_p = new_rght;
+				new->node.branches.b[EB_LEFT] = old_node;
+				new->node.branches.b[EB_RGHT] = new_leaf;
+			}
+			break;
+		}
+
+		/* walk down */
+		root = &old->node.branches;
+		side = (((unsigned char *)new->key)[old_node_bit >> 3] >> (~old_node_bit & 7)) & 1;
+		troot = root->b[side];
+	}
+
+	/* Ok, now we are inserting <new> between <root> and <old>. <old>'s
+	 * parent is already set to <new>, and the <root>'s branch is still in
+	 * <side>. Update the root's leaf till we have it. Note that we can also
+	 * find the side by checking the side of new->node.node_p.
+	 */
+
+	/* We need the common higher bits between new->key and old->key.
+	 * This number of bits is already in <bit>.
+	 * NOTE: we can't get here with bit < 0 since we found a dup!
+	 */
+	new->node.bit = bit;
+	root->b[side] = eb_dotag(&new->node.branches, EB_NODE);
+	return new;
+}
+
+#endif /* _EBISTREE_H */
diff --git a/include/import/ebmbtree.h b/include/import/ebmbtree.h
new file mode 100644
index 0000000..365042e
--- /dev/null
+++ b/include/import/ebmbtree.h
@@ -0,0 +1,850 @@
+/*
+ * Elastic Binary Trees - macros and structures for Multi-Byte data nodes.
+ * Version 6.0.6
+ * (C) 2002-2011 - Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef _EBMBTREE_H
+#define _EBMBTREE_H
+
+#include <string.h>
+#include "ebtree.h"
+
+/* Return the structure of type <type> whose member <member> points to <ptr> */
+#define ebmb_entry(ptr, type, member) container_of(ptr, type, member)
+
+/*
+ * Exported functions and macros.
+ * Many of them are always inlined because they are extremely small, and
+ * are generally called at most once or twice in a program.
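+ *
+ * Editorial illustration (not part of the original sources): a typical
+ * caller embeds an ebmb_node in front of the key storage, since <key> is
+ * a flexible array member (see ebtree.h). With a hypothetical 4-byte key
+ * such as an IPv4 address:
+ *
+ *	struct host {
+ *		struct ebmb_node node;
+ *		unsigned char addr[4];	// backing storage for node.key
+ *	};
+ *
+ *	struct eb_root root = EB_ROOT;	// or EB_ROOT_UNIQUE for unique keys
+ *	struct host h;
+ *	memcpy(h.node.key, "\x7f\x00\x00\x01", 4);
+ *	ebmb_insert(&root, &h.node, 4);
+ *	struct ebmb_node *found = ebmb_lookup(&root, "\x7f\x00\x00\x01", 4);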
+ */ + +/* Return leftmost node in the tree, or NULL if none */ +static forceinline struct ebmb_node *ebmb_first(struct eb_root *root) +{ + return ebmb_entry(eb_first(root), struct ebmb_node, node); +} + +/* Return rightmost node in the tree, or NULL if none */ +static forceinline struct ebmb_node *ebmb_last(struct eb_root *root) +{ + return ebmb_entry(eb_last(root), struct ebmb_node, node); +} + +/* Return next node in the tree, or NULL if none */ +static forceinline struct ebmb_node *ebmb_next(struct ebmb_node *ebmb) +{ + return ebmb_entry(eb_next(&ebmb->node), struct ebmb_node, node); +} + +/* Return previous node in the tree, or NULL if none */ +static forceinline struct ebmb_node *ebmb_prev(struct ebmb_node *ebmb) +{ + return ebmb_entry(eb_prev(&ebmb->node), struct ebmb_node, node); +} + +/* Return next leaf node within a duplicate sub-tree, or NULL if none. */ +static inline struct ebmb_node *ebmb_next_dup(struct ebmb_node *ebmb) +{ + return ebmb_entry(eb_next_dup(&ebmb->node), struct ebmb_node, node); +} + +/* Return previous leaf node within a duplicate sub-tree, or NULL if none. */ +static inline struct ebmb_node *ebmb_prev_dup(struct ebmb_node *ebmb) +{ + return ebmb_entry(eb_prev_dup(&ebmb->node), struct ebmb_node, node); +} + +/* Return next node in the tree, skipping duplicates, or NULL if none */ +static forceinline struct ebmb_node *ebmb_next_unique(struct ebmb_node *ebmb) +{ + return ebmb_entry(eb_next_unique(&ebmb->node), struct ebmb_node, node); +} + +/* Return previous node in the tree, skipping duplicates, or NULL if none */ +static forceinline struct ebmb_node *ebmb_prev_unique(struct ebmb_node *ebmb) +{ + return ebmb_entry(eb_prev_unique(&ebmb->node), struct ebmb_node, node); +} + +/* Delete node from the tree if it was linked in. Mark the node unused. Note + * that this function relies on a non-inlined generic function: eb_delete. + */ +static forceinline void ebmb_delete(struct ebmb_node *ebmb) +{ + eb_delete(&ebmb->node); +} + +/* The following functions are not inlined by default. They are declared + * in ebmbtree.c, which simply relies on their inline version. + */ +struct ebmb_node *ebmb_lookup(struct eb_root *root, const void *x, unsigned int len); +struct ebmb_node *ebmb_insert(struct eb_root *root, struct ebmb_node *new, unsigned int len); +struct ebmb_node *ebmb_lookup_longest(struct eb_root *root, const void *x); +struct ebmb_node *ebmb_lookup_prefix(struct eb_root *root, const void *x, unsigned int pfx); +struct ebmb_node *ebmb_insert_prefix(struct eb_root *root, struct ebmb_node *new, unsigned int len); + +/* start from a valid leaf and find the next matching prefix that's either a + * duplicate, or immediately shorter than the node's current one and still + * matches it. The purpose is to permit a caller that is not satisfied with a + * result provided by ebmb_lookup_longest() to evaluate the next matching + * entry. Given that shorter keys are necessarily attached to nodes located + * above the current one, it's sufficient to restart from the current leaf and + * go up until we find a shorter prefix, or a non-matching one. + */ +static inline struct ebmb_node *ebmb_lookup_shorter(struct ebmb_node *start) +{ + eb_troot_t *t = start->node.leaf_p; + struct ebmb_node *node; + + /* first, check for duplicates */ + node = ebmb_next_dup(start); + if (node) + return node; + + while (1) { + if (eb_gettag(t) == EB_LEFT) { + /* Walking up from left branch. We must ensure that we never + * walk beyond root. 
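+			 *
+			 * (Added note, not in the original: the root is
+			 * recognizable because a tree head keeps NULL, or the
+			 * unique-keys tag 1, in its right branch; eb_clrtag()
+			 * reduces both to NULL, whereas a genuine node always
+			 * has a real child pointer there.)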
+			 */
+			if (unlikely(eb_clrtag((eb_untag(t, EB_LEFT))->b[EB_RGHT]) == NULL))
+				return NULL;
+			node = container_of(eb_root_to_node(eb_untag(t, EB_LEFT)), struct ebmb_node, node);
+		} else {
+			/* Walking up from right branch, so we cannot be below
+			 * root. However, if we end up on a node with an even
+			 * and positive bit, this is a cover node, which mandates
+			 * that the left branch only contains cover values, so we
+			 * must descend it.
+			 */
+			node = container_of(eb_root_to_node(eb_untag(t, EB_RGHT)), struct ebmb_node, node);
+			if (node->node.bit > 0 && !(node->node.bit & 1))
+				return ebmb_entry(eb_walk_down(t, EB_LEFT), struct ebmb_node, node);
+		}
+
+		/* Note that <t> cannot be NULL at this stage */
+		t = node->node.node_p;
+
+		/* this is a node attached to a deeper (and possibly different)
+		 * leaf, not interesting for us.
+		 */
+		if (node->node.pfx >= start->node.pfx)
+			continue;
+
+		if (check_bits(start->key, node->key, 0, node->node.pfx) == 0)
+			break;
+	}
+	return node;
+}
+
+/* The following functions are less likely to be used directly, because their
+ * code is larger. The non-inlined version is preferred.
+ */
+
+/* Delete node from the tree if it was linked in. Mark the node unused. */
+static forceinline void __ebmb_delete(struct ebmb_node *ebmb)
+{
+	__eb_delete(&ebmb->node);
+}
+
+/* Find the first occurrence of a key of at least <len> bytes matching <x> in the
+ * tree <root>. The caller is responsible for ensuring that <len> will not exceed
+ * the common parts between the tree's keys and <x>. In case of multiple matches,
+ * the leftmost node is returned. This means that this function can be used to
+ * lookup string keys by prefix if all keys in the tree are zero-terminated. If
+ * no match is found, NULL is returned. Returns first node if <len> is zero.
+ */
+static forceinline struct ebmb_node *__ebmb_lookup(struct eb_root *root, const void *x, unsigned int len)
+{
+	struct ebmb_node *node;
+	eb_troot_t *troot;
+	int pos, side;
+	int node_bit;
+
+	troot = root->b[EB_LEFT];
+	if (unlikely(troot == NULL))
+		goto ret_null;
+
+	if (unlikely(len == 0))
+		goto walk_down;
+
+	pos = 0;
+	while (1) {
+		if (eb_gettag(troot) == EB_LEAF) {
+			node = container_of(eb_untag(troot, EB_LEAF),
+					    struct ebmb_node, node.branches);
+			if (eb_memcmp(node->key + pos, x, len) != 0)
+				goto ret_null;
+			else
+				goto ret_node;
+		}
+		node = container_of(eb_untag(troot, EB_NODE),
+				    struct ebmb_node, node.branches);
+
+		node_bit = node->node.bit;
+		if (node_bit < 0) {
+			/* We have a dup tree now. Either it's for the same
+			 * value, and we walk down left, or it's a different
+			 * one and we don't have our key.
+			 */
+			if (eb_memcmp(node->key + pos, x, len) != 0)
+				goto ret_null;
+			else
+				goto walk_left;
+		}
+
+		/* OK, normal data node, let's walk down. We check if all full
+		 * bytes are equal, and we start from the last one we did not
+		 * completely check. We stop as soon as we reach the last byte,
+		 * because we must decide to go left/right or abort.
+		 */
+		node_bit = ~node_bit + (pos << 3) + 8; // = (pos<<3) + (7 - node_bit)
+		if (node_bit < 0) {
+			/* This surprising construction gives better performance
+			 * because gcc does not try to reorder the loop. Tested to
+			 * be fine with 2.95 to 4.2.
+ */ + while (1) { + if (node->key[pos++] ^ *(unsigned char*)(x++)) + goto ret_null; /* more than one full byte is different */ + if (--len == 0) + goto walk_left; /* return first node if all bytes matched */ + node_bit += 8; + if (node_bit >= 0) + break; + } + } + + /* here we know that only the last byte differs, so node_bit < 8. + * We have 2 possibilities : + * - more than the last bit differs => return NULL + * - walk down on side = (x[pos] >> node_bit) & 1 + */ + side = *(unsigned char *)x >> node_bit; + if (((node->key[pos] >> node_bit) ^ side) > 1) + goto ret_null; + side &= 1; + troot = node->node.branches.b[side]; + } + walk_left: + troot = node->node.branches.b[EB_LEFT]; + walk_down: + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT]; + node = container_of(eb_untag(troot, EB_LEAF), + struct ebmb_node, node.branches); + ret_node: + return node; + ret_null: + return NULL; +} + +/* Insert ebmb_node <new> into subtree starting at node root <root>. + * Only new->key needs be set with the key. The ebmb_node is returned. + * If root->b[EB_RGHT]==1, the tree may only contain unique keys. The + * len is specified in bytes. It is absolutely mandatory that this length + * is the same for all keys in the tree. This function cannot be used to + * insert strings. + */ +static forceinline struct ebmb_node * +__ebmb_insert(struct eb_root *root, struct ebmb_node *new, unsigned int len) +{ + struct ebmb_node *old; + unsigned int side; + eb_troot_t *troot, **up_ptr; + eb_troot_t *root_right; + int diff; + int bit; + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf; + int old_node_bit; + + side = EB_LEFT; + troot = root->b[EB_LEFT]; + root_right = root->b[EB_RGHT]; + if (unlikely(troot == NULL)) { + /* Tree is empty, insert the leaf part below the left branch */ + root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF); + new->node.leaf_p = eb_dotag(root, EB_LEFT); + new->node.node_p = NULL; /* node part unused */ + return new; + } + + /* The tree descent is fairly easy : + * - first, check if we have reached a leaf node + * - second, check if we have gone too far + * - third, reiterate + * Everywhere, we use <new> for the node node we are inserting, <root> + * for the node we attach it to, and <old> for the node we are + * displacing below <new>. <troot> will always point to the future node + * (tagged with its type). <side> carries the side the node <new> is + * attached to below its parent, which is also where previous node + * was attached. + */ + + bit = 0; + while (1) { + if (unlikely(eb_gettag(troot) == EB_LEAF)) { + /* insert above a leaf */ + old = container_of(eb_untag(troot, EB_LEAF), + struct ebmb_node, node.branches); + new->node.node_p = old->node.leaf_p; + up_ptr = &old->node.leaf_p; + goto check_bit_and_break; + } + + /* OK we're walking down this link */ + old = container_of(eb_untag(troot, EB_NODE), + struct ebmb_node, node.branches); + old_node_bit = old->node.bit; + + if (unlikely(old->node.bit < 0)) { + /* We're above a duplicate tree, so we must compare the whole value */ + new->node.node_p = old->node.node_p; + up_ptr = &old->node.node_p; + check_bit_and_break: + bit = equal_bits(new->key, old->key, bit, len << 3); + break; + } + + /* Stop going down when we don't have common bits anymore. We + * also stop in front of a duplicates tree because it means we + * have to insert above. Note: we can compare more bits than + * the current node's because as long as they are identical, we + * know we descend along the correct side. 
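+		 *
+		 * (Added worked example, not in the original: for the 2-byte
+		 * keys "ab" and "ac", equal_bits(new->key, old->key, 0, 16)
+		 * returns 15 because 'b' ^ 'c' == 0x01 only differs in the
+		 * last bit; cmp_bits() at bit 15 then reports "ab" < "ac",
+		 * so "ab" takes the left branch.)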
+ */ + + bit = equal_bits(new->key, old->key, bit, old_node_bit); + if (unlikely(bit < old_node_bit)) { + /* The tree did not contain the key, so we insert <new> before the + * node <old>, and set ->bit to designate the lowest bit position in + * <new> which applies to ->branches.b[]. + */ + new->node.node_p = old->node.node_p; + up_ptr = &old->node.node_p; + break; + } + /* we don't want to skip bits for further comparisons, so we must limit <bit>. + * However, since we're going down around <old_node_bit>, we know it will be + * properly matched, so we can skip this bit. + */ + bit = old_node_bit + 1; + + /* walk down */ + root = &old->node.branches; + side = old_node_bit & 7; + side ^= 7; + side = (new->key[old_node_bit >> 3] >> side) & 1; + troot = root->b[side]; + } + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + + new->node.bit = bit; + + /* Note: we can compare more bits than the current node's because as + * long as they are identical, we know we descend along the correct + * side. However we don't want to start to compare past the end. + */ + diff = 0; + if (((unsigned)bit >> 3) < len) + diff = cmp_bits(new->key, old->key, bit); + + if (diff == 0) { + new->node.bit = -1; /* mark as new dup tree, just in case */ + + if (likely(eb_gettag(root_right))) { + /* we refuse to duplicate this key if the tree is + * tagged as containing only unique keys. + */ + return old; + } + + if (eb_gettag(troot) != EB_LEAF) { + /* there was already a dup tree below */ + struct eb_node *ret; + ret = eb_insert_dup(&old->node, &new->node); + return container_of(ret, struct ebmb_node, node); + } + /* otherwise fall through */ + } + + if (diff >= 0) { + new->node.branches.b[EB_LEFT] = troot; + new->node.branches.b[EB_RGHT] = new_leaf; + new->node.leaf_p = new_rght; + *up_ptr = new_left; + } + else { + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = troot; + new->node.leaf_p = new_left; + *up_ptr = new_rght; + } + + /* Ok, now we are inserting <new> between <root> and <old>. <old>'s + * parent is already set to <new>, and the <root>'s branch is still in + * <side>. Update the root's leaf till we have it. Note that we can also + * find the side by checking the side of new->node.node_p. + */ + + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; +} + + +/* Find the first occurrence of the longest prefix matching a key <x> in the + * tree <root>. It's the caller's responsibility to ensure that key <x> is at + * least as long as the keys in the tree. Note that this can be ensured by + * having a byte at the end of <x> which cannot be part of any prefix, typically + * the trailing zero for a string. If none can be found, return NULL. + */ +static forceinline struct ebmb_node *__ebmb_lookup_longest(struct eb_root *root, const void *x) +{ + struct ebmb_node *node; + eb_troot_t *troot, *cover; + int pos, side; + int node_bit; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + cover = NULL; + pos = 0; + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + node = container_of(eb_untag(troot, EB_LEAF), + struct ebmb_node, node.branches); + if (check_bits(x - pos, node->key, pos, node->node.pfx)) + goto not_found; + + return node; + } + node = container_of(eb_untag(troot, EB_NODE), + struct ebmb_node, node.branches); + + node_bit = node->node.bit; + if (node_bit < 0) { + /* We have a dup tree now. 
Either it's for the same + * value, and we walk down left, or it's a different + * one and we don't have our key. + */ + if (check_bits(x - pos, node->key, pos, node->node.pfx)) + goto not_found; + + troot = node->node.branches.b[EB_LEFT]; + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT]; + node = container_of(eb_untag(troot, EB_LEAF), + struct ebmb_node, node.branches); + return node; + } + + node_bit >>= 1; /* strip cover bit */ + node_bit = ~node_bit + (pos << 3) + 8; // = (pos<<3) + (7 - node_bit) + if (node_bit < 0) { + /* This uncommon construction gives better performance + * because gcc does not try to reorder the loop. Tested to + * be fine with 2.95 to 4.2. + */ + while (1) { + x++; pos++; + if (node->key[pos-1] ^ *(unsigned char*)(x-1)) + goto not_found; /* more than one full byte is different */ + node_bit += 8; + if (node_bit >= 0) + break; + } + } + + /* here we know that only the last byte differs, so 0 <= node_bit <= 7. + * We have 2 possibilities : + * - more than the last bit differs => data does not match + * - walk down on side = (x[pos] >> node_bit) & 1 + */ + side = *(unsigned char *)x >> node_bit; + if (((node->key[pos] >> node_bit) ^ side) > 1) + goto not_found; + + if (!(node->node.bit & 1)) { + /* This is a cover node, let's keep a reference to it + * for later. The covering subtree is on the left, and + * the covered subtree is on the right, so we have to + * walk down right. + */ + cover = node->node.branches.b[EB_LEFT]; + troot = node->node.branches.b[EB_RGHT]; + continue; + } + side &= 1; + troot = node->node.branches.b[side]; + } + + not_found: + /* Walk down last cover tree if it exists. It does not matter if cover is NULL */ + return ebmb_entry(eb_walk_down(cover, EB_LEFT), struct ebmb_node, node); +} + + +/* Find the first occurrence of a prefix matching a key <x> of <pfx> BITS in the + * tree <root>. It's the caller's responsibility to ensure that key <x> is at + * least as long as the keys in the tree. Note that this can be ensured by + * having a byte at the end of <x> which cannot be part of any prefix, typically + * the trailing zero for a string. If none can be found, return NULL. + */ +static forceinline struct ebmb_node *__ebmb_lookup_prefix(struct eb_root *root, const void *x, unsigned int pfx) +{ + struct ebmb_node *node; + eb_troot_t *troot; + int pos, side; + int node_bit; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + pos = 0; + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + node = container_of(eb_untag(troot, EB_LEAF), + struct ebmb_node, node.branches); + if (node->node.pfx != pfx) + return NULL; + if (check_bits(x - pos, node->key, pos, node->node.pfx)) + return NULL; + return node; + } + node = container_of(eb_untag(troot, EB_NODE), + struct ebmb_node, node.branches); + + node_bit = node->node.bit; + if (node_bit < 0) { + /* We have a dup tree now. Either it's for the same + * value, and we walk down left, or it's a different + * one and we don't have our key. 
+ */ + if (node->node.pfx != pfx) + return NULL; + if (check_bits(x - pos, node->key, pos, node->node.pfx)) + return NULL; + + troot = node->node.branches.b[EB_LEFT]; + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT]; + node = container_of(eb_untag(troot, EB_LEAF), + struct ebmb_node, node.branches); + return node; + } + + node_bit >>= 1; /* strip cover bit */ + node_bit = ~node_bit + (pos << 3) + 8; // = (pos<<3) + (7 - node_bit) + if (node_bit < 0) { + /* This uncommon construction gives better performance + * because gcc does not try to reorder the loop. Tested to + * be fine with 2.95 to 4.2. + */ + while (1) { + x++; pos++; + if (node->key[pos-1] ^ *(unsigned char*)(x-1)) + return NULL; /* more than one full byte is different */ + node_bit += 8; + if (node_bit >= 0) + break; + } + } + + /* here we know that only the last byte differs, so 0 <= node_bit <= 7. + * We have 2 possibilities : + * - more than the last bit differs => data does not match + * - walk down on side = (x[pos] >> node_bit) & 1 + */ + side = *(unsigned char *)x >> node_bit; + if (((node->key[pos] >> node_bit) ^ side) > 1) + return NULL; + + if (!(node->node.bit & 1)) { + /* This is a cover node, it may be the entry we're + * looking for. We already know that it matches all the + * bits, let's compare prefixes and descend the cover + * subtree if they match. + */ + if ((unsigned short)node->node.bit >> 1 == pfx) + troot = node->node.branches.b[EB_LEFT]; + else + troot = node->node.branches.b[EB_RGHT]; + continue; + } + side &= 1; + troot = node->node.branches.b[side]; + } +} + + +/* Insert ebmb_node <new> into a prefix subtree starting at node root <root>. + * Only new->key and new->pfx need be set with the key and its prefix length. + * Note that bits between <pfx> and <len> are theoretically ignored and should be + * zero, as it is not certain yet that they will always be ignored everywhere + * (eg in bit compare functions). + * The ebmb_node is returned. + * If root->b[EB_RGHT]==1, the tree may only contain unique keys. The + * len is specified in bytes. + */ +static forceinline struct ebmb_node * +__ebmb_insert_prefix(struct eb_root *root, struct ebmb_node *new, unsigned int len) +{ + struct ebmb_node *old; + unsigned int side; + eb_troot_t *troot, **up_ptr; + eb_troot_t *root_right; + int diff; + int bit; + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf; + int old_node_bit; + unsigned int npfx = new->node.pfx; + unsigned int npfx1 = npfx << 1; + const unsigned char *nkey = new->key; + + side = EB_LEFT; + troot = root->b[EB_LEFT]; + root_right = root->b[EB_RGHT]; + if (unlikely(troot == NULL)) { + /* Tree is empty, insert the leaf part below the left branch */ + root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF); + new->node.leaf_p = eb_dotag(root, EB_LEFT); + new->node.node_p = NULL; /* node part unused */ + return new; + } + + len <<= 3; + if (len > npfx) + len = npfx; + + /* The tree descent is fairly easy : + * - first, check if we have reached a leaf node + * - second, check if we have gone too far + * - third, reiterate + * Everywhere, we use <new> for the node node we are inserting, <root> + * for the node we attach it to, and <old> for the node we are + * displacing below <new>. <troot> will always point to the future node + * (tagged with its type). <side> carries the side the node <new> is + * attached to below its parent, which is also where previous node + * was attached. 
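+	 *
+	 * Editorial sketch (not part of the original sources): a hypothetical
+	 * longest-prefix-match table built on this API could look like:
+	 *
+	 *	struct net {
+	 *		struct ebmb_node node;
+	 *		unsigned char addr[4];	// backing storage for node.key
+	 *	};
+	 *
+	 *	struct eb_root root = EB_ROOT;
+	 *	struct net n;
+	 *	memcpy(n.node.key, "\x0a\x00\x00\x00", 4);	// 10.0.0.0
+	 *	n.node.node.pfx = 8;	// /8; pfx lives in the embedded eb_node
+	 *	ebmb_insert_prefix(&root, &n.node, 4);
+	 *
+	 *	const unsigned char ip[4] = { 10, 1, 2, 3 };
+	 *	struct ebmb_node *m = ebmb_lookup_longest(&root, ip);	// finds n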
+ */ + + bit = 0; + while (1) { + if (unlikely(eb_gettag(troot) == EB_LEAF)) { + /* Insert above a leaf. Note that this leaf could very + * well be part of a cover node. + */ + old = container_of(eb_untag(troot, EB_LEAF), + struct ebmb_node, node.branches); + new->node.node_p = old->node.leaf_p; + up_ptr = &old->node.leaf_p; + goto check_bit_and_break; + } + + /* OK we're walking down this link */ + old = container_of(eb_untag(troot, EB_NODE), + struct ebmb_node, node.branches); + old_node_bit = old->node.bit; + /* Note that old_node_bit can be : + * < 0 : dup tree + * = 2N : cover node for N bits + * = 2N+1 : normal node at N bits + */ + + if (unlikely(old_node_bit < 0)) { + /* We're above a duplicate tree, so we must compare the whole value */ + new->node.node_p = old->node.node_p; + up_ptr = &old->node.node_p; + check_bit_and_break: + /* No need to compare everything if the leaves are shorter than the new one. */ + if (len > old->node.pfx) + len = old->node.pfx; + bit = equal_bits(nkey, old->key, bit, len); + break; + } + + /* WARNING: for the two blocks below, <bit> is counted in half-bits */ + + bit = equal_bits(nkey, old->key, bit, old_node_bit >> 1); + bit = (bit << 1) + 1; // assume comparisons with normal nodes + + /* we must always check that our prefix is larger than the nodes + * we visit, otherwise we have to stop going down. The following + * test is able to stop before both normal and cover nodes. + */ + if (bit >= npfx1 && npfx1 < old_node_bit) { + /* insert cover node here on the left */ + new->node.node_p = old->node.node_p; + up_ptr = &old->node.node_p; + new->node.bit = npfx1; + diff = -1; + goto insert_above; + } + + if (unlikely(bit < old_node_bit)) { + /* The tree did not contain the key, so we insert <new> before the + * node <old>, and set ->bit to designate the lowest bit position in + * <new> which applies to ->branches.b[]. We know that the bit is not + * greater than the prefix length thanks to the test above. + */ + new->node.node_p = old->node.node_p; + up_ptr = &old->node.node_p; + new->node.bit = bit; + diff = cmp_bits(nkey, old->key, bit >> 1); + goto insert_above; + } + + if (!(old_node_bit & 1)) { + /* if we encounter a cover node with our exact prefix length, it's + * necessarily the same value, so we insert there as a duplicate on + * the left. For that, we go down on the left and the leaf detection + * code will finish the job. + */ + if (npfx1 == old_node_bit) { + root = &old->node.branches; + side = EB_LEFT; + troot = root->b[side]; + continue; + } + + /* cover nodes are always walked through on the right */ + side = EB_RGHT; + bit = old_node_bit >> 1; /* recheck that bit */ + root = &old->node.branches; + troot = root->b[side]; + continue; + } + + /* we don't want to skip bits for further comparisons, so we must limit <bit>. + * However, since we're going down around <old_node_bit>, we know it will be + * properly matched, so we can skip this bit. + */ + old_node_bit >>= 1; + bit = old_node_bit + 1; + + /* walk down */ + root = &old->node.branches; + side = old_node_bit & 7; + side ^= 7; + side = (nkey[old_node_bit >> 3] >> side) & 1; + troot = root->b[side]; + } + + /* Right here, we have 4 possibilities : + * - the tree does not contain any leaf matching the + * key, and we have new->key < old->key. We insert + * new above old, on the left ; + * + * - the tree does not contain any leaf matching the + * key, and we have new->key > old->key. 
We insert + * new above old, on the right ; + * + * - the tree does contain the key with the same prefix + * length. We add the new key next to it as a first + * duplicate (since it was alone). + * + * The last two cases can easily be partially merged. + * + * - the tree contains a leaf matching the key, we have + * to insert above it as a cover node. The leaf with + * the shortest prefix becomes the left subtree and + * the leaf with the longest prefix becomes the right + * one. The cover node gets the min of both prefixes + * as its new bit. + */ + + /* first we want to ensure that we compare the correct bit, which means + * the largest common to both nodes. + */ + if (bit > npfx) + bit = npfx; + if (bit > old->node.pfx) + bit = old->node.pfx; + + new->node.bit = (bit << 1) + 1; /* assume normal node by default */ + + /* if one prefix is included in the second one, we don't compare bits + * because they won't necessarily match, we just proceed with a cover + * node insertion. + */ + diff = 0; + if (bit < old->node.pfx && bit < npfx) + diff = cmp_bits(nkey, old->key, bit); + + if (diff == 0) { + /* Both keys match. Either it's a duplicate entry or we have to + * put the shortest prefix left and the largest one right below + * a new cover node. By default, diff==0 means we'll be inserted + * on the right. + */ + new->node.bit--; /* anticipate cover node insertion */ + if (npfx == old->node.pfx) { + new->node.bit = -1; /* mark as new dup tree, just in case */ + + if (unlikely(eb_gettag(root_right))) { + /* we refuse to duplicate this key if the tree is + * tagged as containing only unique keys. + */ + return old; + } + + if (eb_gettag(troot) != EB_LEAF) { + /* there was already a dup tree below */ + struct eb_node *ret; + ret = eb_insert_dup(&old->node, &new->node); + return container_of(ret, struct ebmb_node, node); + } + /* otherwise fall through to insert first duplicate */ + } + /* otherwise we just rely on the tests below to select the right side */ + else if (npfx < old->node.pfx) + diff = -1; /* force insertion to left side */ + } + + insert_above: + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + + if (diff >= 0) { + new->node.branches.b[EB_LEFT] = troot; + new->node.branches.b[EB_RGHT] = new_leaf; + new->node.leaf_p = new_rght; + *up_ptr = new_left; + } + else { + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = troot; + new->node.leaf_p = new_left; + *up_ptr = new_rght; + } + + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; +} + + + +#endif /* _EBMBTREE_H */ + diff --git a/include/import/ebpttree.h b/include/import/ebpttree.h new file mode 100644 index 0000000..64816a2 --- /dev/null +++ b/include/import/ebpttree.h @@ -0,0 +1,156 @@ +/* + * Elastic Binary Trees - macros and structures for operations on pointer nodes. + * Version 6.0.6 + * (C) 2002-2011 - Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _EBPTTREE_H +#define _EBPTTREE_H + +#include "ebtree.h" +#include "eb32tree.h" +#include "eb64tree.h" + + +/* Return the structure of type <type> whose member <member> points to <ptr> */ +#define ebpt_entry(ptr, type, member) container_of(ptr, type, member) + +/* + * Exported functions and macros. + * Many of them are always inlined because they are extremely small, and + * are generally called at most once or twice in a program. + */ + +/* Return leftmost node in the tree, or NULL if none */ +static forceinline struct ebpt_node *ebpt_first(struct eb_root *root) +{ + return ebpt_entry(eb_first(root), struct ebpt_node, node); +} + +/* Return rightmost node in the tree, or NULL if none */ +static forceinline struct ebpt_node *ebpt_last(struct eb_root *root) +{ + return ebpt_entry(eb_last(root), struct ebpt_node, node); +} + +/* Return next node in the tree, or NULL if none */ +static forceinline struct ebpt_node *ebpt_next(struct ebpt_node *ebpt) +{ + return ebpt_entry(eb_next(&ebpt->node), struct ebpt_node, node); +} + +/* Return previous node in the tree, or NULL if none */ +static forceinline struct ebpt_node *ebpt_prev(struct ebpt_node *ebpt) +{ + return ebpt_entry(eb_prev(&ebpt->node), struct ebpt_node, node); +} + +/* Return next leaf node within a duplicate sub-tree, or NULL if none. */ +static inline struct ebpt_node *ebpt_next_dup(struct ebpt_node *ebpt) +{ + return ebpt_entry(eb_next_dup(&ebpt->node), struct ebpt_node, node); +} + +/* Return previous leaf node within a duplicate sub-tree, or NULL if none. */ +static inline struct ebpt_node *ebpt_prev_dup(struct ebpt_node *ebpt) +{ + return ebpt_entry(eb_prev_dup(&ebpt->node), struct ebpt_node, node); +} + +/* Return next node in the tree, skipping duplicates, or NULL if none */ +static forceinline struct ebpt_node *ebpt_next_unique(struct ebpt_node *ebpt) +{ + return ebpt_entry(eb_next_unique(&ebpt->node), struct ebpt_node, node); +} + +/* Return previous node in the tree, skipping duplicates, or NULL if none */ +static forceinline struct ebpt_node *ebpt_prev_unique(struct ebpt_node *ebpt) +{ + return ebpt_entry(eb_prev_unique(&ebpt->node), struct ebpt_node, node); +} + +/* Delete node from the tree if it was linked in. Mark the node unused. Note + * that this function relies on a non-inlined generic function: eb_delete. + */ +static forceinline void ebpt_delete(struct ebpt_node *ebpt) +{ + eb_delete(&ebpt->node); +} + +/* + * The following functions are inlined but derived from the integer versions. 
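+ *
+ * A caller-side sketch (illustrative only; struct entry, <my_ptr> and the
+ * payload are hypothetical, not part of this header) :
+ *
+ *     struct entry {
+ *         struct ebpt_node node;       // node.key will hold the pointer
+ *         int payload;
+ *     };
+ *
+ *     struct eb_root root = EB_ROOT;   // this root accepts duplicate keys
+ *     struct entry *e = calloc(1, sizeof(*e));
+ *
+ *     e->node.key = my_ptr;            // the pointer itself is the key
+ *     ebpt_insert(&root, &e->node);
+ *
+ *     struct ebpt_node *n = ebpt_lookup(&root, my_ptr);
+ *     if (n)
+ *         ebpt_entry(n, struct entry, node)->payload++;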
+ */ +static forceinline struct ebpt_node *ebpt_lookup(struct eb_root *root, void *x) +{ + if (sizeof(void *) == 4) + return (struct ebpt_node *)eb32_lookup(root, (u32)(PTR_INT_TYPE)x); + else + return (struct ebpt_node *)eb64_lookup(root, (u64)(PTR_INT_TYPE)x); +} + +static forceinline struct ebpt_node *ebpt_lookup_le(struct eb_root *root, void *x) +{ + if (sizeof(void *) == 4) + return (struct ebpt_node *)eb32_lookup_le(root, (u32)(PTR_INT_TYPE)x); + else + return (struct ebpt_node *)eb64_lookup_le(root, (u64)(PTR_INT_TYPE)x); +} + +static forceinline struct ebpt_node *ebpt_lookup_ge(struct eb_root *root, void *x) +{ + if (sizeof(void *) == 4) + return (struct ebpt_node *)eb32_lookup_ge(root, (u32)(PTR_INT_TYPE)x); + else + return (struct ebpt_node *)eb64_lookup_ge(root, (u64)(PTR_INT_TYPE)x); +} + +static forceinline struct ebpt_node *ebpt_insert(struct eb_root *root, struct ebpt_node *new) +{ + if (sizeof(void *) == 4) + return (struct ebpt_node *)eb32_insert(root, (struct eb32_node *)new); + else + return (struct ebpt_node *)eb64_insert(root, (struct eb64_node *)new); +} + +/* + * The following functions are less likely to be used directly, because + * their code is larger. The non-inlined version is preferred. + */ + +/* Delete node from the tree if it was linked in. Mark the node unused. */ +static forceinline void __ebpt_delete(struct ebpt_node *ebpt) +{ + __eb_delete(&ebpt->node); +} + +static forceinline struct ebpt_node *__ebpt_lookup(struct eb_root *root, void *x) +{ + if (sizeof(void *) == 4) + return (struct ebpt_node *)__eb32_lookup(root, (u32)(PTR_INT_TYPE)x); + else + return (struct ebpt_node *)__eb64_lookup(root, (u64)(PTR_INT_TYPE)x); +} + +static forceinline struct ebpt_node *__ebpt_insert(struct eb_root *root, struct ebpt_node *new) +{ + if (sizeof(void *) == 4) + return (struct ebpt_node *)__eb32_insert(root, (struct eb32_node *)new); + else + return (struct ebpt_node *)__eb64_insert(root, (struct eb64_node *)new); +} + +#endif /* _EBPT_TREE_H */ diff --git a/include/import/ebsttree.h b/include/import/ebsttree.h new file mode 100644 index 0000000..db2267b --- /dev/null +++ b/include/import/ebsttree.h @@ -0,0 +1,324 @@ +/* + * Elastic Binary Trees - macros to manipulate String data nodes. + * Version 6.0.6 + * (C) 2002-2011 - Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* These functions and macros rely on Multi-Byte nodes */ + +#ifndef _EBSTTREE_H +#define _EBSTTREE_H + +#include "ebtree.h" +#include "ebmbtree.h" + +/* The following functions are not inlined by default. They are declared + * in ebsttree.c, which simply relies on their inline version. + */ +struct ebmb_node *ebst_lookup(struct eb_root *root, const char *x); +struct ebmb_node *ebst_insert(struct eb_root *root, struct ebmb_node *new); + +/* Find the first occurrence of a length <len> string <x> in the tree <root>. 
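+ * For example (illustrative only, assuming <root> already holds the
+ * zero-terminated key "exam"), the first 4 characters of a larger buffer
+ * can be looked up without copying them out :
+ *
+ *     struct ebmb_node *n = ebst_lookup_len(&root, "example", 4);
+ *     // matches the stored key "exam" only, since key[4] must be 0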
+ * It's the caller's responsibility to use this function only on trees which + * only contain zero-terminated strings, and that no null character is present + * in string <x> in the first <len> chars. If none can be found, return NULL. + */ +static forceinline struct ebmb_node * +ebst_lookup_len(struct eb_root *root, const char *x, unsigned int len) +{ + struct ebmb_node *node; + + node = ebmb_lookup(root, x, len); + if (!node || node->key[len] != 0) + return NULL; + return node; +} + +/* Find the first occurrence of a zero-terminated string <x> in the tree <root>. + * It's the caller's responsibility to use this function only on trees which + * only contain zero-terminated strings. If none can be found, return NULL. + */ +static forceinline struct ebmb_node *__ebst_lookup(struct eb_root *root, const void *x) +{ + struct ebmb_node *node; + eb_troot_t *troot; + int bit; + int node_bit; + + troot = root->b[EB_LEFT]; + if (unlikely(troot == NULL)) + return NULL; + + bit = 0; + while (1) { + if ((eb_gettag(troot) == EB_LEAF)) { + node = container_of(eb_untag(troot, EB_LEAF), + struct ebmb_node, node.branches); + if (strcmp((char *)node->key, x) == 0) + return node; + else + return NULL; + } + node = container_of(eb_untag(troot, EB_NODE), + struct ebmb_node, node.branches); + node_bit = node->node.bit; + + if (node_bit < 0) { + /* We have a dup tree now. Either it's for the same + * value, and we walk down left, or it's a different + * one and we don't have our key. + */ + if (strcmp((char *)node->key, x) != 0) + return NULL; + + troot = node->node.branches.b[EB_LEFT]; + while (eb_gettag(troot) != EB_LEAF) + troot = (eb_untag(troot, EB_NODE))->b[EB_LEFT]; + node = container_of(eb_untag(troot, EB_LEAF), + struct ebmb_node, node.branches); + return node; + } + + /* OK, normal data node, let's walk down but don't compare data + * if we already reached the end of the key. + */ + if (likely(bit >= 0)) { + bit = string_equal_bits(x, node->key, bit); + if (likely(bit < node_bit)) { + if (bit >= 0) + return NULL; /* no more common bits */ + + /* bit < 0 : we reached the end of the key. If we + * are in a tree with unique keys, we can return + * this node. Otherwise we have to walk it down + * and stop comparing bits. + */ + if (eb_gettag(root->b[EB_RGHT])) + return node; + } + /* if the bit is larger than the node's, we must bound it + * because we might have compared too many bytes with an + * inappropriate leaf. For a test, build a tree from "0", + * "WW", "W", "S" inserted in this exact sequence and lookup + * "W" => "S" is returned without this assignment. + */ + else + bit = node_bit; + } + + troot = node->node.branches.b[(((unsigned char*)x)[node_bit >> 3] >> + (~node_bit & 7)) & 1]; + } +} + +/* Insert ebmb_node <new> into subtree starting at node root <root>. Only + * new->key needs be set with the zero-terminated string key. The ebmb_node is + * returned. If root->b[EB_RGHT]==1, the tree may only contain unique keys. The + * caller is responsible for properly terminating the key with a zero. 
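+ *
+ * A caller-side sketch (illustrative only; struct str_entry and its sizes
+ * are hypothetical). String keys are stored by appending storage right
+ * after the node, which node->key then aliases :
+ *
+ *     struct str_entry {
+ *         struct ebmb_node node;
+ *         char storage[64];            // aliased by node.key
+ *     };
+ *
+ *     struct eb_root root = EB_ROOT_UNIQUE;   // unique keys only
+ *     struct str_entry *e = calloc(1, sizeof(*e));
+ *
+ *     memcpy(e->node.key, "example", 8);      // 8 bytes: includes the zero
+ *     ebst_insert(&root, &e->node);
+ *
+ *     struct ebmb_node *n = ebst_lookup(&root, "example");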
+ */ +static forceinline struct ebmb_node * +__ebst_insert(struct eb_root *root, struct ebmb_node *new) +{ + struct ebmb_node *old; + unsigned int side; + eb_troot_t *troot; + eb_troot_t *root_right; + int diff; + int bit; + int old_node_bit; + + side = EB_LEFT; + troot = root->b[EB_LEFT]; + root_right = root->b[EB_RGHT]; + if (unlikely(troot == NULL)) { + /* Tree is empty, insert the leaf part below the left branch */ + root->b[EB_LEFT] = eb_dotag(&new->node.branches, EB_LEAF); + new->node.leaf_p = eb_dotag(root, EB_LEFT); + new->node.node_p = NULL; /* node part unused */ + return new; + } + + /* The tree descent is fairly easy : + * - first, check if we have reached a leaf node + * - second, check if we have gone too far + * - third, reiterate + * Everywhere, we use <new> for the node node we are inserting, <root> + * for the node we attach it to, and <old> for the node we are + * displacing below <new>. <troot> will always point to the future node + * (tagged with its type). <side> carries the side the node <new> is + * attached to below its parent, which is also where previous node + * was attached. + */ + + bit = 0; + while (1) { + if (unlikely(eb_gettag(troot) == EB_LEAF)) { + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_leaf; + + old = container_of(eb_untag(troot, EB_LEAF), + struct ebmb_node, node.branches); + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_leaf = eb_dotag(&old->node.branches, EB_LEAF); + + new->node.node_p = old->node.leaf_p; + + /* Right here, we have 3 possibilities : + * - the tree does not contain the key, and we have + * new->key < old->key. We insert new above old, on + * the left ; + * + * - the tree does not contain the key, and we have + * new->key > old->key. We insert new above old, on + * the right ; + * + * - the tree does contain the key, which implies it + * is alone. We add the new key next to it as a + * first duplicate. + * + * The last two cases can easily be partially merged. + */ + if (bit >= 0) + bit = string_equal_bits(new->key, old->key, bit); + + if (bit < 0) { + /* key was already there */ + + /* we may refuse to duplicate this key if the tree is + * tagged as containing only unique keys. + */ + if (eb_gettag(root_right)) + return old; + + /* new arbitrarily goes to the right and tops the dup tree */ + old->node.leaf_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_leaf; + new->node.branches.b[EB_RGHT] = new_leaf; + new->node.bit = -1; + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; + } + + diff = cmp_bits(new->key, old->key, bit); + if (diff < 0) { + /* new->key < old->key, new takes the left */ + new->node.leaf_p = new_left; + old->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_leaf; + } else { + /* new->key > old->key, new takes the right */ + old->node.leaf_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_leaf; + new->node.branches.b[EB_RGHT] = new_leaf; + } + break; + } + + /* OK we're walking down this link */ + old = container_of(eb_untag(troot, EB_NODE), + struct ebmb_node, node.branches); + old_node_bit = old->node.bit; + + /* Stop going down when we don't have common bits anymore. We + * also stop in front of a duplicates tree because it means we + * have to insert above. 
Note: we can compare more bits than + * the current node's because as long as they are identical, we + * know we descend along the correct side. + */ + if (bit >= 0 && (bit < old_node_bit || old_node_bit < 0)) + bit = string_equal_bits(new->key, old->key, bit); + + if (unlikely(bit < 0)) { + /* Perfect match, we must only stop on head of dup tree + * or walk down to a leaf. + */ + if (old_node_bit < 0) { + /* We know here that string_equal_bits matched all + * bits and that we're on top of a dup tree, then + * we can perform the dup insertion and return. + */ + struct eb_node *ret; + ret = eb_insert_dup(&old->node, &new->node); + return container_of(ret, struct ebmb_node, node); + } + /* OK so let's walk down */ + } + else if (bit < old_node_bit || old_node_bit < 0) { + /* The tree did not contain the key, or we stopped on top of a dup + * tree, possibly containing the key. In the former case, we insert + * <new> before the node <old>, and set ->bit to designate the lowest + * bit position in <new> which applies to ->branches.b[]. In the later + * case, we add the key to the existing dup tree. Note that we cannot + * enter here if we match an intermediate node's key that is not the + * head of a dup tree. + */ + eb_troot_t *new_left, *new_rght; + eb_troot_t *new_leaf, *old_node; + + new_left = eb_dotag(&new->node.branches, EB_LEFT); + new_rght = eb_dotag(&new->node.branches, EB_RGHT); + new_leaf = eb_dotag(&new->node.branches, EB_LEAF); + old_node = eb_dotag(&old->node.branches, EB_NODE); + + new->node.node_p = old->node.node_p; + + /* we can never match all bits here */ + diff = cmp_bits(new->key, old->key, bit); + if (diff < 0) { + new->node.leaf_p = new_left; + old->node.node_p = new_rght; + new->node.branches.b[EB_LEFT] = new_leaf; + new->node.branches.b[EB_RGHT] = old_node; + } + else { + old->node.node_p = new_left; + new->node.leaf_p = new_rght; + new->node.branches.b[EB_LEFT] = old_node; + new->node.branches.b[EB_RGHT] = new_leaf; + } + break; + } + + /* walk down */ + root = &old->node.branches; + side = (new->key[old_node_bit >> 3] >> (~old_node_bit & 7)) & 1; + troot = root->b[side]; + } + + /* Ok, now we are inserting <new> between <root> and <old>. <old>'s + * parent is already set to <new>, and the <root>'s branch is still in + * <side>. Update the root's leaf till we have it. Note that we can also + * find the side by checking the side of new->node.node_p. + */ + + /* We need the common higher bits between new->key and old->key. + * This number of bits is already in <bit>. + * NOTE: we can't get here whit bit < 0 since we found a dup ! + */ + new->node.bit = bit; + root->b[side] = eb_dotag(&new->node.branches, EB_NODE); + return new; +} + +#endif /* _EBSTTREE_H */ + diff --git a/include/import/ebtree-t.h b/include/import/ebtree-t.h new file mode 100644 index 0000000..b695426 --- /dev/null +++ b/include/import/ebtree-t.h @@ -0,0 +1,217 @@ +/* + * Elastic Binary Trees - types + * Version 6.0.6 + * (C) 2002-2011 - Willy Tarreau <w@1wt.eu> + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef _EBTREE_T_H +#define _EBTREE_T_H + +#include <haproxy/api-t.h> + +/* + * generic types for ebtree + */ + +/* Number of bits per node, and number of leaves per node */ +#define EB_NODE_BITS 1 +#define EB_NODE_BRANCHES (1 << EB_NODE_BITS) +#define EB_NODE_BRANCH_MASK (EB_NODE_BRANCHES - 1) + +/* Be careful not to tweak those values. The walking code is optimized for NULL + * detection on the assumption that the following values are intact. + */ +#define EB_LEFT 0 +#define EB_RGHT 1 +#define EB_LEAF 0 +#define EB_NODE 1 + +/* Tags to set in root->b[EB_RGHT] : + * - EB_NORMAL is a normal tree which stores duplicate keys. + * - EB_UNIQUE is a tree which stores unique keys. + */ +#define EB_NORMAL 0 +#define EB_UNIQUE 1 + +/* This is the same as an eb_node pointer, except that the lower bit embeds + * a tag. See eb_dotag()/eb_untag()/eb_gettag(). This tag has two meanings : + * - 0=left, 1=right to designate the parent's branch for leaf_p/node_p + * - 0=link, 1=leaf to designate the branch's type for branch[] + */ +typedef void eb_troot_t; + +/* The eb_root connects the node which contains it, to two nodes below it, one + * of which may be the same node. At the top of the tree, we use an eb_root + * too, which always has its right branch NULL (+/1 low-order bits). + */ +struct eb_root { + eb_troot_t *b[EB_NODE_BRANCHES]; /* left and right branches */ +}; + +/* The eb_node contains the two parts, one for the leaf, which always exists, + * and one for the node, which remains unused in the very first node inserted + * into the tree. This structure is 20 bytes per node on 32-bit machines. Do + * not change the order, benchmarks have shown that it's optimal this way. + * Note: be careful about this struct's alignment if it gets included into + * another struct and some atomic ops are expected on the keys or the node. + */ +struct eb_node { + struct eb_root branches; /* branches, must be at the beginning */ + eb_troot_t *node_p; /* link node's parent */ + eb_troot_t *leaf_p; /* leaf node's parent */ + short int bit; /* link's bit position. */ + short unsigned int pfx; /* data prefix length, always related to leaf */ +} __attribute__((packed)); + + +/* The root of a tree is an eb_root initialized with both pointers NULL. + * During its life, only the left pointer will change. The right one will + * always remain NULL, which is the way we detect it. + */ +#define EB_ROOT \ + (struct eb_root) { \ + .b = {[0] = NULL, [1] = NULL }, \ + } + +#define EB_ROOT_UNIQUE \ + (struct eb_root) { \ + .b = {[0] = NULL, [1] = (void *)1 }, \ + } + +#define EB_TREE_HEAD(name) \ + struct eb_root name = EB_ROOT + + +/* + * types for eb32tree + */ + +#define EB32_ROOT EB_ROOT +#define EB32_TREE_HEAD EB_TREE_HEAD + +/* These types may sometimes already be defined */ +typedef unsigned int u32; +typedef signed int s32; + +/* This structure carries a node, a leaf, and a key. It must start with the + * eb_node so that it can be cast into an eb_node. We could also have put some + * sort of transparent union here to reduce the indirection level, but the fact + * is, the end user is not meant to manipulate internals, so this is pointless. 
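+ *
+ * A typical caller-side use (sketch only; struct task, <t>, <timers> and
+ * <expire> are hypothetical, and eb32_insert()/eb32_first() come from
+ * eb32tree.h, not shown here) :
+ *
+ *     struct task {
+ *         struct eb32_node wait;       // key = expiration tick
+ *     };
+ *
+ *     t->wait.key = expire;
+ *     eb32_insert(&timers, &t->wait);
+ *
+ *     struct eb32_node *first = eb32_first(&timers);  // soonest expiration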
+ */ +struct eb32_node { + struct eb_node node; /* the tree node, must be at the beginning */ + MAYBE_ALIGN(sizeof(u32)); + u32 key; +} ALIGNED(sizeof(void*)); + +/* This structure carries a node, a leaf, a scope, and a key. It must start + * with the eb_node so that it can be cast into an eb_node. We could also + * have put some sort of transparent union here to reduce the indirection + * level, but the fact is, the end user is not meant to manipulate internals, + * so this is pointless. + * In case sizeof(void*)>=sizeof(long), we know there will be some padding after + * the leaf if it's unaligned. In this case we force the alignment on void* so + * that we prefer to have the padding before for more efficient accesses. + */ +struct eb32sc_node { + struct eb_node node; /* the tree node, must be at the beginning */ + MAYBE_ALIGN(sizeof(u32)); + u32 key; + ALWAYS_ALIGN(sizeof(void*)); + unsigned long node_s; /* visibility of this node's branches */ + unsigned long leaf_s; /* visibility of this node's leaf */ +} ALIGNED(sizeof(void*)); + +/* + * types for eb64tree + */ + +#define EB64_ROOT EB_ROOT +#define EB64_TREE_HEAD EB_TREE_HEAD + +/* These types may sometimes already be defined */ +typedef unsigned long long u64; +typedef signed long long s64; + +/* This structure carries a node, a leaf, and a key. It must start with the + * eb_node so that it can be cast into an eb_node. We could also have put some + * sort of transparent union here to reduce the indirection level, but the fact + * is, the end user is not meant to manipulate internals, so this is pointless. + * In case sizeof(void*)>=sizeof(u64), we know there will be some padding after + * the key if it's unaligned. In this case we force the alignment on void* so + * that we prefer to have the padding before for more efficient accesses. + */ +struct eb64_node { + struct eb_node node; /* the tree node, must be at the beginning */ + MAYBE_ALIGN(sizeof(u64)); + ALWAYS_ALIGN(sizeof(void*)); + u64 key; +} ALIGNED(sizeof(void*)); + +#define EBPT_ROOT EB_ROOT +#define EBPT_TREE_HEAD EB_TREE_HEAD + +/* on *almost* all platforms, a pointer can be cast into a size_t which is unsigned */ +#ifndef PTR_INT_TYPE +#define PTR_INT_TYPE size_t +#endif + +/* + * types for ebpttree + */ + +typedef PTR_INT_TYPE ptr_t; + +/* This structure carries a node, a leaf, and a key. It must start with the + * eb_node so that it can be cast into an eb_node. We could also have put some + * sort of transparent union here to reduce the indirection level, but the fact + * is, the end user is not meant to manipulate internals, so this is pointless. + * Internally, it is automatically cast as an eb32_node or eb64_node. + * We always align the key since the struct itself will be padded to the same + * size anyway. + */ +struct ebpt_node { + struct eb_node node; /* the tree node, must be at the beginning */ + ALWAYS_ALIGN(sizeof(void*)); + void *key; +} ALIGNED(sizeof(void*)); + +/* + * types for ebmbtree + */ + +#define EBMB_ROOT EB_ROOT +#define EBMB_TREE_HEAD EB_TREE_HEAD + +/* This structure carries a node, a leaf, and a key. It must start with the + * eb_node so that it can be cast into an eb_node. We could also have put some + * sort of transparent union here to reduce the indirection level, but the fact + * is, the end user is not meant to manipulate internals, so this is pointless. + * The 'node.bit' value here works differently from scalar types, as it contains + * the number of identical bits between the two branches. 
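+ * As an illustration only (hypothetical caller-side struct <ip_entry>, with
+ * <e> allocated by the caller), fixed-size binary keys such as IPv4
+ * addresses are handled by appending storage for the key right after the
+ * node and passing the key length to ebmb_insert() from ebmbtree.h :
+ *
+ *     struct ip_entry {
+ *         struct ebmb_node node;
+ *         unsigned char addr[4];       // aliased by node.key
+ *     };
+ *
+ *     memcpy(e->node.key, &ipv4, 4);   // fill the 4-byte key
+ *     ebmb_insert(&root, &e->node, 4);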
+ * Note that we take great care to make sure the key is located exactly at
+ * the end of the struct, even if that involves holes before it, so that it
+ * always aliases any external key a user would append after. This is why the
+ * key uses the same alignment as the struct.
+ */
+struct ebmb_node {
+	struct eb_node node; /* the tree node, must be at the beginning */
+	ALWAYS_ALIGN(sizeof(void*));
+	unsigned char key[0]; /* the key, its size depends on the application */
+} ALIGNED(sizeof(void*));
+
+#endif /* _EBTREE_T_H */
diff --git a/include/import/ebtree.h b/include/import/ebtree.h
new file mode 100644
index 0000000..d6e51d5
--- /dev/null
+++ b/include/import/ebtree.h
@@ -0,0 +1,857 @@
+/*
+ * Elastic Binary Trees - generic macros and structures.
+ * Version 6.0.6
+ * (C) 2002-2011 - Willy Tarreau <w@1wt.eu>
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation, version 2.1
+ * exclusively.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+
+
+/*
+  General idea:
+  -------------
+  In a radix binary tree, we may have up to 2N-1 nodes for N keys if all of
+  them are leaves. If we find a way to differentiate intermediate nodes (later
+  called "nodes") and final nodes (later called "leaves"), and we associate
+  them by two, it is possible to build sort of a self-contained radix tree with
+  intermediate nodes always present. It will not be as cheap as the ultree for
+  optimal cases as shown below, but the optimal case almost never happens :
+
+  Eg, to store 8, 10, 12, 13, 14 :
+
+            ultree          this theoretical tree
+
+             8                   8
+            / \                 / \
+           10 12               10  12
+              / \                 /  \
+             13 14               12   14
+                                / \
+                               12  13
+
+   Note that on real-world tests (with a scheduler), it was verified that the
+   case with data on an intermediate node never happens. This is because the
+   data spectrum is too large for such coincidences to happen. It would require
+   for instance that a task has its expiration time at an exact second, with
+   other tasks sharing that second. This is too rare to try to optimize for it.
+
+   What is interesting is that the node will only be added above the leaf when
+   necessary, which implies that it will always remain somewhere above it. So
+   both the leaf and the node can share the exact value of the leaf, because
+   when going down the node, the bit mask will be applied to comparisons. So we
+   are tempted to have one single key shared between the node and the leaf.
+
+   The bit only serves the nodes, and the dups only serve the leaves. So we can
+   put a lot of information in common.
+   This results in one single entity with two branch pointers and two parent
+   pointers, one for the node part, and one for the leaf part :
+
+              node's         leaf's
+              parent         parent
+                |              |
+              [node]         [leaf]
+               / \
+           left   right
+         branch   branch
+
+   The node may very well refer to its leaf counterpart in one of its branches,
+   indicating that its own leaf is just below it :
+
+              node's
+              parent
+                |
+              [node]
+               / \
+           left   [leaf]
+         branch
+
+   Adding keys in such a tree simply consists in inserting nodes between
+   other nodes and/or leaves :
+
+                 [root]
+                   |
+                [node2]
+                 /   \
+          [leaf1]   [node3]
+                     /   \
+              [leaf2]   [leaf3]
+
+   On this diagram, we notice that [node2] and [leaf2] have been pulled away
+   from each other due to the insertion of [node3], just as if there were an
+   elastic between both parts. This elastic-like behaviour gave its name to
+   the tree : "Elastic Binary Tree", or "EBtree". The entity which associates
+   a node part and a leaf part will be called an "EB node".
+
+   We also notice on the diagram that there is a root entity required to attach
+   the tree. It only contains two branches and there is nothing above it. This
+   is an "EB root". Some will note that [leaf1] has no [node1]. One property of
+   the EBtree is that all nodes have their branches filled, and that if a node
+   has only one branch, it does not need to exist. Here, [leaf1] was added
+   below [root] and did not need any node.
+
+   An EB node contains :
+     - a pointer to the node's parent (node_p)
+     - a pointer to the leaf's parent (leaf_p)
+     - two branches pointing to lower nodes or leaves (branches)
+     - a bit position (bit)
+     - an optional key.
+
+   The key here is optional because it's used only during insertion, in order
+   to classify the nodes. Nothing else in the tree structure requires knowledge
+   of the key. This makes it possible to write type-agnostic primitives for
+   everything, and type-specific insertion primitives. This has led to
+   considering two types of EB nodes. The type-agnostic ones will serve as a
+   header for the other ones, and will simply be called "struct eb_node". The
+   other ones will have their type indicated in the structure name. Eg:
+   "struct eb32_node" for nodes carrying 32 bit keys.
+
+   We will also note that the two branches in a node serve exactly the same
+   purpose as an EB root. For this reason, a "struct eb_root" will be used as
+   well inside the struct eb_node. In order to ease pointer manipulation and
+   ROOT detection when walking upwards, all the pointers inside an eb_node will
+   point to the eb_root part of the referenced EB nodes, relying on the same
+   principle as the linked lists in Linux.
+
+   Another important point to note is that when walking inside a tree, it is
+   very convenient to know where a node is attached in its parent, and what
+   type of branch it has below it (leaf or node). In order to simplify the
+   operations and to speed up the processing, it was decided in this specific
+   implementation to use the lowest bit from the pointer to designate the side
+   of the upper pointers (left/right) and the type of a branch (leaf/node).
+   This practise is not mandatory by design, but an implementation-specific
+   optimisation permitted on all platforms on which data must be aligned. All
+   known 32 bit platforms align their integers and pointers to 32 bits, leaving
+   the two lower bits unused. So, we say that the pointers are "tagged". And
+   since they designate pointers to root parts, we simply call them
+   "tagged root pointers", or "eb_troot" in the code.
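+
+   As a small concrete illustration of this tagging scheme (hypothetical
+   pointer values, using the eb_dotag()/eb_untag()/eb_gettag() helpers
+   defined further down in this file) :
+
+       eb_troot_t *t = eb_dotag(&node->branches, EB_RGHT); // == ptr | 1
+       eb_gettag(t);          // -> 1 : attached as a right branch
+       eb_untag(t, EB_RGHT);  // -> &node->branches, the original pointer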
+
+   Duplicate keys are stored in a special manner. When inserting a key, if
+   the same one is found, then an incremental binary tree is built at this
+   place from these keys. This ensures that no special case has to be written
+   to handle duplicates when walking through the tree or when deleting entries.
+   It also guarantees that duplicates will be walked in the exact same order
+   they were inserted. This is very important when trying to achieve fair
+   processing distribution for instance.
+
+   Algorithmic complexity can be derived from 3 variables :
+     - the number of possible different keys in the tree : P
+     - the number of entries in the tree                 : N
+     - the number of duplicates for one key              : D
+
+   Note that this tree is deliberately NOT balanced. For this reason, the worst
+   case may happen with a small tree (eg: 32 distinct keys of one bit). BUT,
+   the operations required to manage such data are so cheap that they make it
+   worth using even under such conditions. For instance, a balanced tree may
+   require only 6 levels to store those 32 keys while this tree will require
+   32. But if per-level operations are 5 times cheaper, it wins.
+
+   Minimal, Maximal and Average times are specified in number of operations.
+   Minimal is given for the best condition, Maximal for the worst condition,
+   and the average is reported for a tree containing random keys. An operation
+   generally consists in jumping from one node to another.
+
+   Complexity :
+     - lookup              : min=1, max=log(P), avg=log(N)
+     - insertion from root : min=1, max=log(P), avg=log(N)
+     - insertion of dups   : min=1, max=log(D), avg=log(D)/2 after lookup
+     - deletion            : min=1, max=1,      avg=1
+     - prev/next           : min=1, max=log(P), avg=2 :
+       N/2 nodes need 1 hop       => 1*N/2
+       N/4 nodes need 2 hops      => 2*N/4
+       N/8 nodes need 3 hops      => 3*N/8
+       ...
+       N/x nodes need log(x) hops => log2(x)*N/x
+       Total cost for all N nodes : sum[i=1..N](log2(i)*N/i) = N*sum[i=1..N](log2(i)/i)
+       Average cost across N nodes = total / N = sum[i=1..N](log2(i)/i) = 2
+
+   This design is currently limited to only two branches per node. Most of the
+   tree descent algorithm would be compatible with more branches (eg: 4, to cut
+   the height in half), but this would probably require more complex operations
+   and the deletion algorithm would be problematic.
+
+   Useful properties :
+     - a node is always added above the leaf it is tied to, and can never get
+       below it nor into another branch. This implies that leaves directly
+       attached to the root do not use their node part, which is indicated by
+       a NULL value in node_p. This also enhances the cache efficiency when
+       walking down the tree, because when the leaf is reached, its node part
+       will already have been visited (unless it's the first leaf in the tree).
+
+     - pointers to lower nodes or leaves are stored in "branch" pointers. Only
+       the root node may have a NULL in either branch; it is not possible for
+       other branches. Since the nodes are attached to the left branch of the
+       root, it is not possible to see a NULL left branch when walking up a
+       tree. Thus, an empty tree is immediately identified by a NULL left
+       branch at the root. Conversely, the one and only way to identify the
+       root node is to check that its right branch is NULL. Note that the
+       NULL pointer may have a few low-order bits set.
+
+     - a node connected to its own leaf will have branch[0|1] pointing to
+       itself, and leaf_p pointing to itself.
+
+     - a node can never have node_p pointing to itself.
+
+     - a node is linked in a tree if and only if it has a non-null leaf_p.
+ + - a node can never have both branches equal, except for the root which can + have them both NULL. + + - deletion only applies to leaves. When a leaf is deleted, its parent must + be released too (unless it's the root), and its sibling must attach to + the grand-parent, replacing the parent. Also, when a leaf is deleted, + the node tied to this leaf will be removed and must be released too. If + this node is different from the leaf's parent, the freshly released + leaf's parent will be used to replace the node which must go. A released + node will never be used anymore, so there's no point in tracking it. + + - the bit index in a node indicates the bit position in the key which is + represented by the branches. That means that a node with (bit == 0) is + just above two leaves. Negative bit values are used to build a duplicate + tree. The first node above two identical leaves gets (bit == -1). This + value logarithmically decreases as the duplicate tree grows. During + duplicate insertion, a node is inserted above the highest bit value (the + lowest absolute value) in the tree during the right-sided walk. If bit + -1 is not encountered (highest < -1), we insert above last leaf. + Otherwise, we insert above the node with the highest value which was not + equal to the one of its parent + 1. + + - the "eb_next" primitive walks from left to right, which means from lower + to higher keys. It returns duplicates in the order they were inserted. + The "eb_first" primitive returns the left-most entry. + + - the "eb_prev" primitive walks from right to left, which means from + higher to lower keys. It returns duplicates in the opposite order they + were inserted. The "eb_last" primitive returns the right-most entry. + + - a tree which has 1 in the lower bit of its root's right branch is a + tree with unique nodes. This means that when a node is inserted with + a key which already exists will not be inserted, and the previous + entry will be returned. + + */ + +#ifndef _EBTREE_H +#define _EBTREE_H + +#include <stdlib.h> +#include <import/ebtree-t.h> +#include <haproxy/api.h> + +static inline int flsnz8_generic(unsigned int x) +{ + int ret = 0; + if (x >> 4) { x >>= 4; ret += 4; } + return ret + ((0xFFFFAA50U >> (x << 1)) & 3) + 1; +} + +/* Note: we never need to run fls on null keys, so we can optimize the fls + * function by removing a conditional jump. + */ +#if defined(__i386__) || defined(__x86_64__) +/* this code is similar on 32 and 64 bit */ +static inline int flsnz(int x) +{ + int r; + __asm__("bsrl %1,%0\n" + : "=r" (r) : "rm" (x)); + return r+1; +} + +static inline int flsnz8(unsigned char x) +{ + int r; + __asm__("movzbl %%al, %%eax\n" + "bsrl %%eax,%0\n" + : "=r" (r) : "a" (x)); + return r+1; +} + +#else +// returns 1 to 32 for 1<<0 to 1<<31. Undefined for 0. +#define flsnz(___a) ({ \ + register int ___x, ___bits = 0; \ + ___x = (___a); \ + if (___x & 0xffff0000) { ___x &= 0xffff0000; ___bits += 16;} \ + if (___x & 0xff00ff00) { ___x &= 0xff00ff00; ___bits += 8;} \ + if (___x & 0xf0f0f0f0) { ___x &= 0xf0f0f0f0; ___bits += 4;} \ + if (___x & 0xcccccccc) { ___x &= 0xcccccccc; ___bits += 2;} \ + if (___x & 0xaaaaaaaa) { ___x &= 0xaaaaaaaa; ___bits += 1;} \ + ___bits + 1; \ + }) + +static inline int flsnz8(unsigned int x) +{ + return flsnz8_generic(x); +} + + +#endif + +static inline int fls64(unsigned long long x) +{ + unsigned int h; + unsigned int bits = 32; + + h = x >> 32; + if (!h) { + h = x; + bits = 0; + } + return flsnz(h) + bits; +} + +#define fls_auto(x) ((sizeof(x) > 4) ? 
fls64(x) : flsnz(x)) + +/* Linux-like "container_of". It returns a pointer to the structure of type + * <type> which has its member <name> stored at address <ptr>. + */ +#ifndef container_of +#define container_of(ptr, type, name) ((type *)(((void *)(ptr)) - ((long)&((type *)0)->name))) +#endif + +/* returns a pointer to the structure of type <type> which has its member <name> + * stored at address <ptr>, unless <ptr> is 0, in which case 0 is returned. + */ +#ifndef container_of_safe +#define container_of_safe(ptr, type, name) \ + ({ void *__p = (ptr); \ + __p ? (type *)(__p - ((long)&((type *)0)->name)) : (type *)0; \ + }) +#endif + +/* Return the structure of type <type> whose member <member> points to <ptr> */ +#define eb_entry(ptr, type, member) container_of(ptr, type, member) + +/***************************************\ + * Private functions. Not for end-user * +\***************************************/ + +/* Converts a root pointer to its equivalent eb_troot_t pointer, + * ready to be stored in ->branch[], leaf_p or node_p. NULL is not + * conserved. To be used with EB_LEAF, EB_NODE, EB_LEFT or EB_RGHT in <tag>. + */ +static inline eb_troot_t *eb_dotag(const struct eb_root *root, const int tag) +{ + return (eb_troot_t *)((void *)root + tag); +} + +/* Converts an eb_troot_t pointer pointer to its equivalent eb_root pointer, + * for use with pointers from ->branch[], leaf_p or node_p. NULL is conserved + * as long as the tree is not corrupted. To be used with EB_LEAF, EB_NODE, + * EB_LEFT or EB_RGHT in <tag>. + */ +static inline struct eb_root *eb_untag(const eb_troot_t *troot, const int tag) +{ + return (struct eb_root *)((void *)troot - tag); +} + +/* returns the tag associated with an eb_troot_t pointer */ +static inline int eb_gettag(eb_troot_t *troot) +{ + return (unsigned long)troot & 1; +} + +/* Converts a root pointer to its equivalent eb_troot_t pointer and clears the + * tag, no matter what its value was. + */ +static inline struct eb_root *eb_clrtag(const eb_troot_t *troot) +{ + return (struct eb_root *)((unsigned long)troot & ~1UL); +} + +/* Returns a pointer to the eb_node holding <root> */ +static inline struct eb_node *eb_root_to_node(struct eb_root *root) +{ + return container_of(root, struct eb_node, branches); +} + +/* Walks down starting at root pointer <start>, and always walking on side + * <side>. It either returns the node hosting the first leaf on that side, + * or NULL if no leaf is found. <start> may either be NULL or a branch pointer. + * The pointer to the leaf (or NULL) is returned. + */ +static inline struct eb_node *eb_walk_down(eb_troot_t *start, unsigned int side) +{ + /* A NULL pointer on an empty tree root will be returned as-is */ + while (eb_gettag(start) == EB_NODE) + start = (eb_untag(start, EB_NODE))->b[side]; + /* NULL is left untouched (root==eb_node, EB_LEAF==0) */ + return eb_root_to_node(eb_untag(start, EB_LEAF)); +} + +/* This function is used to build a tree of duplicates by adding a new node to + * a subtree of at least 2 entries. It will probably never be needed inlined, + * and it is not for end-user. 
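+ *
+ * End users normally reach it only through the typed insert functions, on
+ * trees initialized as EB_ROOT (duplicates allowed). A sketch, reusing the
+ * hypothetical ebpt-based entries from ebpttree.h's example (<k>, <a> and
+ * <b> are hypothetical) :
+ *
+ *     a->node.key = k; ebpt_insert(&root, &a->node);  // first occurrence
+ *     b->node.key = k; ebpt_insert(&root, &b->node);  // kept as a duplicate
+ *
+ *     struct ebpt_node *n = ebpt_lookup(&root, k);    // first inserted dup
+ *     n = ebpt_next_dup(n);                           // then <b>'s node
+ *     n = ebpt_next_dup(n);                           // NULL : no more dups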
+ */ +static forceinline struct eb_node * +__eb_insert_dup(struct eb_node *sub, struct eb_node *new) +{ + struct eb_node *head = sub; + + eb_troot_t *new_left = eb_dotag(&new->branches, EB_LEFT); + eb_troot_t *new_rght = eb_dotag(&new->branches, EB_RGHT); + eb_troot_t *new_leaf = eb_dotag(&new->branches, EB_LEAF); + + /* first, identify the deepest hole on the right branch */ + while (eb_gettag(head->branches.b[EB_RGHT]) != EB_LEAF) { + struct eb_node *last = head; + head = container_of(eb_untag(head->branches.b[EB_RGHT], EB_NODE), + struct eb_node, branches); + if (head->bit > last->bit + 1) + sub = head; /* there's a hole here */ + } + + /* Here we have a leaf attached to (head)->b[EB_RGHT] */ + if (head->bit < -1) { + /* A hole exists just before the leaf, we insert there */ + new->bit = -1; + sub = container_of(eb_untag(head->branches.b[EB_RGHT], EB_LEAF), + struct eb_node, branches); + head->branches.b[EB_RGHT] = eb_dotag(&new->branches, EB_NODE); + + new->node_p = sub->leaf_p; + new->leaf_p = new_rght; + sub->leaf_p = new_left; + new->branches.b[EB_LEFT] = eb_dotag(&sub->branches, EB_LEAF); + new->branches.b[EB_RGHT] = new_leaf; + return new; + } else { + int side; + /* No hole was found before a leaf. We have to insert above + * <sub>. Note that we cannot be certain that <sub> is attached + * to the right of its parent, as this is only true if <sub> + * is inside the dup tree, not at the head. + */ + new->bit = sub->bit - 1; /* install at the lowest level */ + side = eb_gettag(sub->node_p); + head = container_of(eb_untag(sub->node_p, side), struct eb_node, branches); + head->branches.b[side] = eb_dotag(&new->branches, EB_NODE); + + new->node_p = sub->node_p; + new->leaf_p = new_rght; + sub->node_p = new_left; + new->branches.b[EB_LEFT] = eb_dotag(&sub->branches, EB_NODE); + new->branches.b[EB_RGHT] = new_leaf; + return new; + } +} + + +/**************************************\ + * Public functions, for the end-user * +\**************************************/ + +/* Return non-zero if the tree is empty, otherwise zero */ +static inline int eb_is_empty(const struct eb_root *root) +{ + return !root->b[EB_LEFT]; +} + +/* Return non-zero if the node is a duplicate, otherwise zero */ +static inline int eb_is_dup(const struct eb_node *node) +{ + return node->bit < 0; +} + +/* Return the first leaf in the tree starting at <root>, or NULL if none */ +static inline struct eb_node *eb_first(struct eb_root *root) +{ + return eb_walk_down(root->b[0], EB_LEFT); +} + +/* Return the last leaf in the tree starting at <root>, or NULL if none */ +static inline struct eb_node *eb_last(struct eb_root *root) +{ + return eb_walk_down(root->b[0], EB_RGHT); +} + +/* Return previous leaf node before an existing leaf node, or NULL if none. */ +static inline struct eb_node *eb_prev(struct eb_node *node) +{ + eb_troot_t *t = node->leaf_p; + + while (eb_gettag(t) == EB_LEFT) { + /* Walking up from left branch. We must ensure that we never + * walk beyond root. + */ + if (unlikely(eb_clrtag((eb_untag(t, EB_LEFT))->b[EB_RGHT]) == NULL)) + return NULL; + t = (eb_root_to_node(eb_untag(t, EB_LEFT)))->node_p; + } + /* Note that <t> cannot be NULL at this stage */ + t = (eb_untag(t, EB_RGHT))->b[EB_LEFT]; + return eb_walk_down(t, EB_RGHT); +} + +/* Return next leaf node after an existing leaf node, or NULL if none. 
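+ * Walking a whole tree in ascending key order then boils down to this
+ * sketch (visit() and <root> are hypothetical) :
+ *
+ *     struct eb_node *cur;
+ *
+ *     for (cur = eb_first(&root); cur; cur = eb_next(cur))
+ *         visit(cur);   // leaves are returned in key order, dups included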
*/ +static inline struct eb_node *eb_next(struct eb_node *node) +{ + eb_troot_t *t = node->leaf_p; + + while (eb_gettag(t) != EB_LEFT) + /* Walking up from right branch, so we cannot be below root */ + t = (eb_root_to_node(eb_untag(t, EB_RGHT)))->node_p; + + /* Note that <t> cannot be NULL at this stage */ + t = (eb_untag(t, EB_LEFT))->b[EB_RGHT]; + if (eb_clrtag(t) == NULL) + return NULL; + return eb_walk_down(t, EB_LEFT); +} + +/* Return previous leaf node within a duplicate sub-tree, or NULL if none. */ +static inline struct eb_node *eb_prev_dup(struct eb_node *node) +{ + eb_troot_t *t = node->leaf_p; + + while (eb_gettag(t) == EB_LEFT) { + /* Walking up from left branch. We must ensure that we never + * walk beyond root. + */ + if (unlikely(eb_clrtag((eb_untag(t, EB_LEFT))->b[EB_RGHT]) == NULL)) + return NULL; + /* if the current node leaves a dup tree, quit */ + if ((eb_root_to_node(eb_untag(t, EB_LEFT)))->bit >= 0) + return NULL; + t = (eb_root_to_node(eb_untag(t, EB_LEFT)))->node_p; + } + /* Note that <t> cannot be NULL at this stage */ + if ((eb_root_to_node(eb_untag(t, EB_RGHT)))->bit >= 0) + return NULL; + t = (eb_untag(t, EB_RGHT))->b[EB_LEFT]; + return eb_walk_down(t, EB_RGHT); +} + +/* Return next leaf node within a duplicate sub-tree, or NULL if none. */ +static inline struct eb_node *eb_next_dup(struct eb_node *node) +{ + eb_troot_t *t = node->leaf_p; + + while (eb_gettag(t) != EB_LEFT) { + /* Walking up from right branch, so we cannot be below root */ + /* if the current node leaves a dup tree, quit */ + if ((eb_root_to_node(eb_untag(t, EB_RGHT)))->bit >= 0) + return NULL; + t = (eb_root_to_node(eb_untag(t, EB_RGHT)))->node_p; + } + + /* Note that <t> cannot be NULL at this stage. If our leaf is directly + * under the root, we must not try to cast the leaf_p into a eb_node* + * since it is a pointer to an eb_root. + */ + if (eb_clrtag((eb_untag(t, EB_LEFT))->b[EB_RGHT]) == NULL) + return NULL; + + if ((eb_root_to_node(eb_untag(t, EB_LEFT)))->bit >= 0) + return NULL; + t = (eb_untag(t, EB_LEFT))->b[EB_RGHT]; + return eb_walk_down(t, EB_LEFT); +} + +/* Return previous leaf node before an existing leaf node, skipping duplicates, + * or NULL if none. */ +static inline struct eb_node *eb_prev_unique(struct eb_node *node) +{ + eb_troot_t *t = node->leaf_p; + + while (1) { + if (eb_gettag(t) != EB_LEFT) { + node = eb_root_to_node(eb_untag(t, EB_RGHT)); + /* if we're right and not in duplicates, stop here */ + if (node->bit >= 0) + break; + t = node->node_p; + } + else { + /* Walking up from left branch. We must ensure that we never + * walk beyond root. + */ + if (unlikely(eb_clrtag((eb_untag(t, EB_LEFT))->b[EB_RGHT]) == NULL)) + return NULL; + t = (eb_root_to_node(eb_untag(t, EB_LEFT)))->node_p; + } + } + /* Note that <t> cannot be NULL at this stage */ + t = (eb_untag(t, EB_RGHT))->b[EB_LEFT]; + return eb_walk_down(t, EB_RGHT); +} + +/* Return next leaf node after an existing leaf node, skipping duplicates, or + * NULL if none. 
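+ * For instance, visiting each distinct key exactly once in a tree that may
+ * contain duplicates (sketch; visit_key() and <root> are hypothetical) :
+ *
+ *     struct eb_node *cur;
+ *
+ *     for (cur = eb_first(&root); cur; cur = eb_next_unique(cur))
+ *         visit_key(cur);   // first duplicate of each key only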
+ */ +static inline struct eb_node *eb_next_unique(struct eb_node *node) +{ + eb_troot_t *t = node->leaf_p; + + while (1) { + if (eb_gettag(t) == EB_LEFT) { + if (unlikely(eb_clrtag((eb_untag(t, EB_LEFT))->b[EB_RGHT]) == NULL)) + return NULL; /* we reached root */ + node = eb_root_to_node(eb_untag(t, EB_LEFT)); + /* if we're left and not in duplicates, stop here */ + if (node->bit >= 0) + break; + t = node->node_p; + } + else { + /* Walking up from right branch, so we cannot be below root */ + t = (eb_root_to_node(eb_untag(t, EB_RGHT)))->node_p; + } + } + + /* Note that <t> cannot be NULL at this stage */ + t = (eb_untag(t, EB_LEFT))->b[EB_RGHT]; + if (eb_clrtag(t) == NULL) + return NULL; + return eb_walk_down(t, EB_LEFT); +} + + +/* Removes a leaf node from the tree if it was still in it. Marks the node + * as unlinked. + */ +static forceinline void __eb_delete(struct eb_node *node) +{ + __label__ delete_unlink; + unsigned int pside, gpside, sibtype; + struct eb_node *parent; + struct eb_root *gparent; + + if (!node->leaf_p) + return; + + /* we need the parent, our side, and the grand parent */ + pside = eb_gettag(node->leaf_p); + parent = eb_root_to_node(eb_untag(node->leaf_p, pside)); + + /* We likely have to release the parent link, unless it's the root, + * in which case we only set our branch to NULL. Note that we can + * only be attached to the root by its left branch. + */ + + if (eb_clrtag(parent->branches.b[EB_RGHT]) == NULL) { + /* we're just below the root, it's trivial. */ + parent->branches.b[EB_LEFT] = NULL; + goto delete_unlink; + } + + /* To release our parent, we have to identify our sibling, and reparent + * it directly to/from the grand parent. Note that the sibling can + * either be a link or a leaf. + */ + + gpside = eb_gettag(parent->node_p); + gparent = eb_untag(parent->node_p, gpside); + + gparent->b[gpside] = parent->branches.b[!pside]; + sibtype = eb_gettag(gparent->b[gpside]); + + if (sibtype == EB_LEAF) { + eb_root_to_node(eb_untag(gparent->b[gpside], EB_LEAF))->leaf_p = + eb_dotag(gparent, gpside); + } else { + eb_root_to_node(eb_untag(gparent->b[gpside], EB_NODE))->node_p = + eb_dotag(gparent, gpside); + } + /* Mark the parent unused. Note that we do not check if the parent is + * our own node, but that's not a problem because if it is, it will be + * marked unused at the same time, which we'll use below to know we can + * safely remove it. + */ + parent->node_p = NULL; + + /* The parent node has been detached, and is currently unused. It may + * belong to another node, so we cannot remove it that way. Also, our + * own node part might still be used. so we can use this spare node + * to replace ours if needed. + */ + + /* If our link part is unused, we can safely exit now */ + if (!node->node_p) + goto delete_unlink; + + /* From now on, <node> and <parent> are necessarily different, and the + * <node>'s node part is in use. By definition, <parent> is at least + * below <node>, so keeping its key for the bit string is OK. + */ + + parent->node_p = node->node_p; + parent->branches = node->branches; + parent->bit = node->bit; + + /* We must now update the new node's parent... */ + gpside = eb_gettag(parent->node_p); + gparent = eb_untag(parent->node_p, gpside); + gparent->b[gpside] = eb_dotag(&parent->branches, EB_NODE); + + /* ... 
and its branches */ + for (pside = 0; pside <= 1; pside++) { + if (eb_gettag(parent->branches.b[pside]) == EB_NODE) { + eb_root_to_node(eb_untag(parent->branches.b[pside], EB_NODE))->node_p = + eb_dotag(&parent->branches, pside); + } else { + eb_root_to_node(eb_untag(parent->branches.b[pside], EB_LEAF))->leaf_p = + eb_dotag(&parent->branches, pside); + } + } + delete_unlink: + /* Now the node has been completely unlinked */ + node->leaf_p = NULL; + return; /* tree is not empty yet */ +} + +/* Compare blocks <a> and <b> byte-to-byte, from bit <ignore> to bit <len-1>. + * Return the number of equal bits between strings, assuming that the first + * <ignore> bits are already identical. It is possible to return slightly more + * than <len> bits if <len> does not stop on a byte boundary and we find exact + * bytes. Note that parts or all of <ignore> bits may be rechecked. It is only + * passed here as a hint to speed up the check. + */ +static forceinline int equal_bits(const unsigned char *a, + const unsigned char *b, + int ignore, int len) +{ + for (ignore >>= 3, a += ignore, b += ignore, ignore <<= 3; + ignore < len; ) { + unsigned char c; + + a++; b++; + ignore += 8; + c = b[-1] ^ a[-1]; + + if (c) { + /* OK now we know that old and new differ at byte <ptr> and that <c> holds + * the bit differences. We have to find what bit is differing and report + * it as the number of identical bits. Note that low bit numbers are + * assigned to high positions in the byte, as we compare them as strings. + */ + ignore -= flsnz8(c); + break; + } + } + return ignore; +} + +/* check that the two blocks <a> and <b> are equal on <len> bits. If it is known + * they already are on some bytes, this number of equal bytes to be skipped may + * be passed in <skip>. It returns 0 if they match, otherwise non-zero. + */ +static forceinline int check_bits(const unsigned char *a, + const unsigned char *b, + int skip, + int len) +{ + int bit, ret; + + /* This uncommon construction gives the best performance on x86 because + * it makes heavy use multiple-index addressing and parallel instructions, + * and it prevents gcc from reordering the loop since it is already + * properly oriented. Tested to be fine with 2.95 to 4.2. + */ + bit = ~len + (skip << 3) + 9; // = (skip << 3) + (8 - len) + ret = a[skip] ^ b[skip]; + if (unlikely(bit >= 0)) + return ret >> bit; + while (1) { + skip++; + if (ret) + return ret; + ret = a[skip] ^ b[skip]; + bit += 8; + if (bit >= 0) + return ret >> bit; + } +} + + +/* Compare strings <a> and <b> byte-to-byte, from bit <ignore> to the last 0. + * Return the number of equal bits between strings, assuming that the first + * <ignore> bits are already identical. Note that parts or all of <ignore> bits + * may be rechecked. It is only passed here as a hint to speed up the check. + * The caller is responsible for not passing an <ignore> value larger than any + * of the two strings. However, referencing any bit from the trailing zero is + * permitted. Equal strings are reported as a negative number of bits, which + * indicates the end was reached. + */ +static forceinline int string_equal_bits(const unsigned char *a, + const unsigned char *b, + int ignore) +{ + int beg; + unsigned char c; + + beg = ignore >> 3; + + /* skip known and identical bits. We stop at the first different byte + * or at the first zero we encounter on either side. 
+ */ + while (1) { + unsigned char d; + + c = a[beg]; + d = b[beg]; + beg++; + + c ^= d; + if (c) + break; + if (!d) + return -1; + } + /* OK now we know that a and b differ at byte <beg>, or that both are zero. + * We have to find what bit is differing and report it as the number of + * identical bits. Note that low bit numbers are assigned to high positions + * in the byte, as we compare them as strings. + */ + return (beg << 3) - flsnz8(c); +} + +static forceinline int cmp_bits(const unsigned char *a, const unsigned char *b, unsigned int pos) +{ + unsigned int ofs; + unsigned char bit_a, bit_b; + + ofs = pos >> 3; + pos = ~pos & 7; + + bit_a = (a[ofs] >> pos) & 1; + bit_b = (b[ofs] >> pos) & 1; + + return bit_a - bit_b; /* -1: a<b; 0: a=b; 1: a>b */ +} + +static forceinline int get_bit(const unsigned char *a, unsigned int pos) +{ + unsigned int ofs; + + ofs = pos >> 3; + pos = ~pos & 7; + return (a[ofs] >> pos) & 1; +} + +/* These functions are declared in ebtree.c */ +void eb_delete(struct eb_node *node); +struct eb_node *eb_insert_dup(struct eb_node *sub, struct eb_node *new); +int eb_memcmp(const void *m1, const void *m2, size_t len); + +#endif /* _EB_TREE_H */ + +/* + * Local variables: + * c-indent-level: 8 + * c-basic-offset: 8 + * End: + */ diff --git a/include/import/ist.h b/include/import/ist.h new file mode 100644 index 0000000..16b8616 --- /dev/null +++ b/include/import/ist.h @@ -0,0 +1,957 @@ +/* + * include/import/ist.h + * Very simple indirect string manipulation functions. + * + * Copyright (C) 2014-2020 Willy Tarreau - w@1wt.eu + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef _IMPORT_IST_H +#define _IMPORT_IST_H + +#include <sys/types.h> +#include <ctype.h> +#include <stddef.h> +#include <string.h> + +#ifndef IST_FREESTANDING +#include <stdlib.h> +#endif + +/* ASCII to lower case conversion table */ +#define _IST_LC { \ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, \ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, \ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, \ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, \ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, \ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, \ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, \ + 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, \ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, \ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, \ + 0x78, 0x79, 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, \ + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, \ + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, \ + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, \ + 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, \ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, \ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, \ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, \ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, \ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, \ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, \ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, \ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, \ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, \ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, \ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, \ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, \ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, \ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, \ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, \ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, \ +} + +/* ASCII to upper case conversion table */ +#define _IST_UC { \ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, \ + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, \ + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, \ + 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, \ + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, \ + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, \ + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, \ + 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, \ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, \ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, \ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, \ + 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, \ + 0x60, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, \ + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, \ + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, \ + 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, \ + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, \ + 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, \ + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, \ + 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, \ + 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, \ + 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, \ + 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, \ + 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, \ + 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, \ + 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, \ + 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, \ + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, \ + 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, \ + 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 
0xef, \ + 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, \ + 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, \ +} + +#if defined(USE_OBSOLETE_LINKER) || defined(__TINYC__) +/* some old linkers and some non-ELF platforms have issues with the weak + * attribute so we turn these arrays to literals there. TCC silently ignores + * it so we switch to literal as well. + */ +#define ist_lc ((const unsigned char[256])_IST_LC) +#define ist_uc ((const unsigned char[256])_IST_UC) +#else +const unsigned char ist_lc[256] __attribute__((weak)) = _IST_LC; +const unsigned char ist_uc[256] __attribute__((weak)) = _IST_UC; +#endif + +/* This string definition will most often be used to represent a read-only + * string returned from a function, based on the starting point and its length + * in bytes. No storage is provided, only a pointer and a length. The types + * here are important as we only want to have 2 native machine words there so + * that on modern architectures the compiler is capable of efficiently + * returning a register pair without having to allocate stack room from the + * caller. This is done with -freg-struct which is often enabled by default. + */ +struct ist { + char *ptr; + size_t len; +}; + +/* makes a constant ist from a constant string, for use in array declarations */ +#define IST(str) { .ptr = str "", .len = (sizeof str "") - 1 } + +/* IST_NULL is equivalent to an `ist` with `.ptr = NULL` and `.len = 0` */ +#define IST_NULL ((const struct ist){ .ptr = 0, .len = 0 }) + +/* makes an ist from a regular zero terminated string. Null has length 0. + * Constants are detected and replaced with constant initializers. Other values + * are measured by hand without strlen() as it's much cheaper and inlinable on + * small strings. The construct is complex because we must never call + * __builtin_strlen() with an expression otherwise it involves a real + * measurement. + */ +#if __GNUC__ >= 4 +// gcc >= 4 detects constant propagation of str through __x and resolves the +// length of constant strings easily. +#define ist(str) ({ \ + char *__x = (void *)(str); \ + (struct ist){ \ + .ptr = __x, \ + .len = __builtin_constant_p(str) ? \ + ((void *)str == (void *)0) ? 0 : \ + __builtin_strlen(__x) : \ + ({ \ + size_t __l = 0; \ + if (__x) for (__l--; __x[++__l]; ) ; \ + __l; \ + }) \ + }; \ +}) +#else +// gcc < 4 can't do this, and the side effect is a warning each time a NULL is +// passed to ist() due to the check on __builtin_strlen(). It doesn't have the +// ability to know that this code is never called. +#define ist(str) ({ \ + char *__x = (void *)(str); \ + (struct ist){ \ + .ptr = __x, \ + .len = __builtin_constant_p(str) ? \ + ((void *)str == (void *)0) ? 0 : \ + __builtin_strlen(str) : \ + ({ \ + size_t __l = 0; \ + if (__x) for (__l--; __x[++__l]; ) ; \ + __l; \ + }) \ + }; \ +}) +#endif + +/* makes an ist struct from a string and a length */ +static inline struct ist ist2(const void *ptr, size_t len) +{ + return (struct ist){ .ptr = (char *)ptr, .len = len }; +} + +/* returns the result of `ist.ptr != NULL` */ +static inline int isttest(const struct ist ist) +{ + return ist.ptr != NULL; +} + +/* This function MODIFIES the string to add a zero AFTER the end, and returns + * the start pointer. The purpose is to use it on strings extracted by parsers + * from larger strings cut with delimiters that are not important and can be + * destroyed. It allows any such string to be used with regular string + * functions. 
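+ *
+ * A minimal sketch (hypothetical: <line> is an ist over a writable buffer
+ * being cut up by a parser, and handle_get() is a placeholder; istsplit()
+ * is defined further below):
+ *
+ *   struct ist word = istsplit(&line, ' ');
+ *
+ *   if (strcmp(ist0(word), "GET") == 0)
+ *           handle_get();
+ *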
+ * It's also convenient to use with printf() to show data extracted
+ * from writable areas. The caller is obviously responsible for ensuring that
+ * the string is valid and that the first byte past the end is writable. If
+ * these conditions cannot be satisfied, use istpad() below instead.
+ */
+static inline char *ist0(struct ist ist)
+{
+        ist.ptr[ist.len] = 0;
+        return ist.ptr;
+}
+
+/* returns the pointer to the string */
+static inline char *istptr(const struct ist ist)
+{
+        return ist.ptr;
+}
+
+/* returns the length of the string */
+static inline size_t istlen(const struct ist ist)
+{
+        return ist.len;
+}
+
+/* returns the pointer to the end of the string */
+static inline char *istend(const struct ist ist)
+{
+        return (ist.ptr + ist.len);
+}
+
+/* skips to next character in the string, always stops at the end */
+static inline struct ist istnext(const struct ist ist)
+{
+        struct ist ret = ist;
+
+        if (ret.len) {
+                ret.len--;
+                ret.ptr++;
+        }
+        return ret;
+}
+
+/* Returns the first character of the <ist> and advances the <ist> by 1.
+ * If the <ist> is empty the result is undefined.
+ */
+static inline char istshift(struct ist *ist)
+{
+        if (ist->len) {
+                char c = *ist->ptr;
+                *ist = istnext(*ist);
+
+                return c;
+        }
+
+        return 0;
+}
+
+/* copies the contents from string <ist> to buffer <buf> and adds a trailing
+ * zero. The caller must ensure <buf> is large enough.
+ */
+static inline struct ist istpad(void *buf, const struct ist ist)
+{
+        struct ist ret = { .ptr = buf, .len = ist.len };
+
+        for (ret.len = 0; ret.len < ist.len; ret.len++)
+                ret.ptr[ret.len] = ist.ptr[ret.len];
+
+        ret.ptr[ret.len] = 0;
+        return ret;
+}
+
+/* trims string <ist> to no more than <size> characters. The string is
+ * returned.
+ */
+static inline struct ist isttrim(const struct ist ist, size_t size)
+{
+        struct ist ret = ist;
+
+        if (ret.len > size)
+                ret.len = size;
+        return ret;
+}
+
+/* Sets the <len> of the <ist> to zero and returns the previous length.
+ *
+ * This function is meant to be used in functions that receive an ist containing
+ * the destination buffer and the buffer's size. The returned size must be stored
+ * to prevent an overflow of such a destination buffer.
+ *
+ * If you simply want to clear an ist and do not care about the previous length
+ * then you should use `isttrim(ist, 0)`.
+ *
+ * Example Usage (fill the complete buffer with 'x'):
+ *
+ * void my_func(struct ist* dst)
+ * {
+ *     size_t dst_size = istclear(dst);
+ *     size_t i;
+ *
+ *     for (i = 0; i < dst_size; i++)
+ *         *dst = __istappend(*dst, 'x');
+ * }
+ */
+__attribute__((warn_unused_result))
+static inline size_t istclear(struct ist* ist)
+{
+        size_t len = ist->len;
+
+        ist->len = 0;
+
+        return len;
+}
+
+/* trims string <ist> to no more than <size>-1 characters and ensures that a
+ * zero is placed after <ist.len> (possibly reduced by one) and before <size>,
+ * unless <size> is already zero. The string is returned. This is mostly aimed
+ * at building printable strings that need to be zero-terminated.
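+ *
+ * A short sketch (hypothetical: <src> is any ist, <buf> a local buffer;
+ * isttrim() is defined above and ist2bin() further below):
+ *
+ *   char buf[16];
+ *   struct ist out = ist2bin(buf, isttrim(src, sizeof(buf)));
+ *
+ *   printf("%s\n", istzero(out, sizeof(buf)).ptr);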
+ */ +static inline struct ist istzero(const struct ist ist, size_t size) +{ + struct ist ret = ist; + + if (!size) + ret.len = 0; + else { + if (ret.len > size - 1) + ret.len = size - 1; + ret.ptr[ret.len] = 0; + } + return ret; +} + +/* returns the ordinal difference between two strings : + * < 0 if ist1 < ist2 + * = 0 if ist1 == ist2 + * > 0 if ist1 > ist2 + */ +static inline int istdiff(const struct ist ist1, const struct ist ist2) +{ + struct ist l = ist1; + struct ist r = ist2; + + do { + if (!l.len--) + return -r.len; + if (!r.len--) + return 1; + } while (*l.ptr++ == *r.ptr++); + + return *(unsigned char *)(l.ptr - 1) - *(unsigned char *)(r.ptr - 1); +} + +/* returns non-zero if <ist1> starts like <ist2> (empty strings do match) */ +static inline int istmatch(const struct ist ist1, const struct ist ist2) +{ + struct ist l = ist1; + struct ist r = ist2; + + if (l.len < r.len) + return 0; + + while (r.len--) { + if (*l.ptr++ != *r.ptr++) + return 0; + } + return 1; +} + +/* returns non-zero if <ist1> starts like <ist2>, ignoring the case (empty strings do match) */ +static inline int istmatchi(const struct ist ist1, const struct ist ist2) +{ + struct ist l = ist1; + struct ist r = ist2; + + if (l.len < r.len) + return 0; + + while (r.len--) { + if (*l.ptr != *r.ptr && + ist_lc[(unsigned char)*l.ptr] != ist_lc[(unsigned char)*r.ptr]) + return 0; + + l.ptr++; + r.ptr++; + } + return 1; +} + +/* returns non-zero if <ist1> starts like <ist2> on the first <count> + * characters (empty strings do match). + */ +static inline int istnmatch(const struct ist ist1, const struct ist ist2, size_t count) +{ + struct ist l = ist1; + struct ist r = ist2; + + if (l.len > count) + l.len = count; + if (r.len > count) + r.len = count; + return istmatch(l, r); +} + +/* returns non-zero if <ist1> equals <ist2> (empty strings are equal) */ +static inline int isteq(const struct ist ist1, const struct ist ist2) +{ + struct ist l = ist1; + struct ist r = ist2; + + if (l.len != r.len) + return 0; + + while (l.len--) { + if (*l.ptr++ != *r.ptr++) + return 0; + } + return 1; +} + +/* returns non-zero if <ist1> equals <ist2>, ignoring the case (empty strings are equal) */ +static inline int isteqi(const struct ist ist1, const struct ist ist2) +{ + struct ist l = ist1; + struct ist r = ist2; + + if (l.len != r.len) + return 0; + + while (l.len--) { + if (*l.ptr != *r.ptr && + ist_lc[(unsigned char)*l.ptr] != ist_lc[(unsigned char)*r.ptr]) + return 0; + + l.ptr++; + r.ptr++; + } + return 1; +} + +/* returns non-zero if <ist1> equals <ist2> on the first <count> characters + * (empty strings are equal). + */ +static inline int istneq(const struct ist ist1, const struct ist ist2, size_t count) +{ + struct ist l = ist1; + struct ist r = ist2; + + if (l.len > count) + l.len = count; + if (r.len > count) + r.len = count; + return isteq(l, r); +} + +/* appends <src> after <dst>. The caller must ensure that the underlying buffer + * is large enough to fit the character. + */ +static inline struct ist __istappend(struct ist dst, const char src) +{ + dst.ptr[dst.len++] = src; + + return dst; +} + +/* copies <src> over <dst> for a maximum of <count> bytes. Returns the number + * of characters copied (src.len), or -1 if it does not fit. In all cases, the + * contents are copied prior to reporting an error, so that the destination + * at least contains a valid but truncated string. 
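+ *
+ * Usage sketch (hypothetical destination backed by a local buffer):
+ *
+ *   char buf[8];
+ *   struct ist dst = ist2(buf, 0);
+ *   ssize_t ret = istcpy(&dst, src, sizeof(buf));
+ *
+ * Here <ret> is src.len after a complete copy, or -1 with <dst> truncated
+ * to sizeof(buf) bytes.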
+ */ +static inline ssize_t istcpy(struct ist *dst, const struct ist src, size_t count) +{ + dst->len = 0; + + if (count > src.len) + count = src.len; + + while (dst->len < count) { + dst->ptr[dst->len] = src.ptr[dst->len]; + dst->len++; + } + + if (dst->len == src.len) + return src.len; + + return -1; +} + +/* copies <src> over <dst> for a maximum of <count> bytes. Returns the number + * of characters copied, or -1 if it does not fit. A (possibly truncated) valid + * copy of <src> is always left into <dst>, and a trailing \0 is appended as + * long as <count> is not null, even if that results in reducing the string by + * one character. + */ +static inline ssize_t istscpy(struct ist *dst, const struct ist src, size_t count) +{ + dst->len = 0; + + if (!count) + goto fail; + + if (count > src.len) + count = src.len + 1; + + while (dst->len < count - 1) { + dst->ptr[dst->len] = src.ptr[dst->len]; + dst->len++; + } + + dst->ptr[dst->len] = 0; + if (dst->len == src.len) + return src.len; + fail: + return -1; +} + +/* appends <src> after <dst> for a maximum of <count> total bytes in <dst> after + * the copy. <dst> is assumed to be <count> or less before the call. The new + * string's length is returned, or -1 if a truncation happened. In all cases, + * the contents are copied prior to reporting an error, so that the destination + * at least contains a valid but truncated string. + */ +static inline ssize_t istcat(struct ist *dst, const struct ist src, size_t count) +{ + const char *s = src.ptr; + + while (dst->len < count && s != src.ptr + src.len) + dst->ptr[dst->len++] = *s++; + + if (s == src.ptr + src.len) + return dst->len; + + return -1; +} + +/* appends <src> after <dst> for a maximum of <count> total bytes in <dst> after + * the copy. <dst> is assumed to be <count> or less before the call. The new + * string's length is returned, or -1 if a truncation happened. In all cases, + * the contents are copied prior to reporting an error, so that the destination + * at least contains a valid but truncated string. + */ +static inline ssize_t istscat(struct ist *dst, const struct ist src, size_t count) +{ + const char *s = src.ptr; + + if (!count) + goto fail; + + while (dst->len < count - 1 && s != src.ptr + src.len) { + dst->ptr[dst->len++] = *s++; + } + + dst->ptr[dst->len] = 0; + if (s == src.ptr + src.len) + return dst->len; + fail: + return -1; +} + +/* copies the entire <src> over <dst>, which must be allocated large enough to + * hold the whole contents. No trailing zero is appended, this is mainly used + * for protocol processing where the frame length has already been checked. An + * ist made of the output and its length are returned. The destination is not + * touched if src.len is null. + */ +static inline struct ist ist2bin(char *dst, const struct ist src) +{ + size_t ofs = 0; + + /* discourage the compiler from trying to optimize for large strings, + * but tell it that most of our strings are not empty. + */ + if (__builtin_expect(ofs < src.len, 1)) { + do { + dst[ofs] = src.ptr[ofs]; + ofs++; + } while (__builtin_expect(ofs < src.len, 0)); + } + return ist2(dst, ofs); +} + +/* copies the entire <src> over <dst>, which must be allocated large enough to + * hold the whole contents as well as a trailing zero which is always appended. + * This is mainly used for protocol conversions where the frame length has + * already been checked. An ist made of the output and its length (not counting + * the trailing zero) are returned. 
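+ *
+ * A minimal sketch (hypothetical <name> ist and local buffer, checking
+ * that the contents plus the trailing zero fit):
+ *
+ *   char tmp[32];
+ *
+ *   if (name.len < sizeof(tmp))
+ *           printf("name=%s\n", ist2str(tmp, name).ptr);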
+ */ +static inline struct ist ist2str(char *dst, const struct ist src) +{ + size_t ofs = 0; + + /* discourage the compiler from trying to optimize for large strings, + * but tell it that most of our strings are not empty. + */ + if (__builtin_expect(ofs < src.len, 1)) { + do { + dst[ofs] = src.ptr[ofs]; + ofs++; + } while (__builtin_expect(ofs < src.len, 0)); + } + dst[ofs] = 0; + return ist2(dst, ofs); +} + +/* makes a lower case copy of the entire <src> into <dst>, which must have been + * allocated large enough to hold the whole contents. No trailing zero is + * appended, this is mainly used for protocol processing where the frame length + * has already been checked. An ist made of the output and its length are + * returned. The destination is not touched if src.len is null. + */ +static inline struct ist ist2bin_lc(char *dst, const struct ist src) +{ + size_t ofs = 0; + + /* discourage the compiler from trying to optimize for large strings, + * but tell it that most of our strings are not empty. + */ + if (__builtin_expect(ofs < src.len, 1)) { + do { + dst[ofs] = ist_lc[(unsigned char)src.ptr[ofs]]; + ofs++; + } while (__builtin_expect(ofs < src.len, 0)); + } + return ist2(dst, ofs); +} + +/* makes a lower case copy of the entire <src> into <dst>, which must have been + * allocated large enough to hold the whole contents as well as a trailing zero + * which is always appended. This is mainly used for protocol conversions where + * the frame length has already been checked. An ist made of the output and its + * length (not counting the trailing zero) are returned. + */ +static inline struct ist ist2str_lc(char *dst, const struct ist src) +{ + size_t ofs = 0; + + /* discourage the compiler from trying to optimize for large strings, + * but tell it that most of our strings are not empty. + */ + if (__builtin_expect(ofs < src.len, 1)) { + do { + dst[ofs] = ist_lc[(unsigned char)src.ptr[ofs]]; + ofs++; + } while (__builtin_expect(ofs < src.len, 0)); + } + dst[ofs] = 0; + return ist2(dst, ofs); +} + +/* makes an upper case copy of the entire <src> into <dst>, which must have + * been allocated large enough to hold the whole contents. No trailing zero is + * appended, this is mainly used for protocol processing where the frame length + * has already been checked. An ist made of the output and its length are + * returned. The destination is not touched if src.len is null. + */ +static inline struct ist ist2bin_uc(char *dst, const struct ist src) +{ + size_t ofs = 0; + + /* discourage the compiler from trying to optimize for large strings, + * but tell it that most of our strings are not empty. + */ + if (__builtin_expect(ofs < src.len, 1)) { + do { + dst[ofs] = ist_uc[(unsigned char)src.ptr[ofs]]; + ofs++; + } while (__builtin_expect(ofs < src.len, 0)); + } + return ist2(dst, ofs); +} + +/* makes an upper case copy of the entire <src> into <dst>, which must have been + * allocated large enough to hold the whole contents as well as a trailing zero + * which is always appended. This is mainly used for protocol conversions where + * the frame length has already been checked. An ist made of the output and its + * length (not counting the trailing zero) are returned. + */ +static inline struct ist ist2str_uc(char *dst, const struct ist src) +{ + size_t ofs = 0; + + /* discourage the compiler from trying to optimize for large strings, + * but tell it that most of our strings are not empty. 
+ */ + if (__builtin_expect(ofs < src.len, 1)) { + do { + dst[ofs] = ist_uc[(unsigned char)src.ptr[ofs]]; + ofs++; + } while (__builtin_expect(ofs < src.len, 0)); + } + dst[ofs] = 0; + return ist2(dst, ofs); +} + +/* looks for first occurrence of character <chr> in string <ist>. Returns the + * pointer if found, or NULL if not found. + */ +static inline char *istchr(const struct ist ist, char chr) +{ + char *s = ist.ptr; + + do { + if (s >= ist.ptr + ist.len) + return NULL; + } while (*s++ != chr); + return s - 1; +} + +/* Returns a pointer to the first control character found in <ist>, or NULL if + * none is present. A control character is defined as a byte whose value is + * between 0x00 and 0x1F included. The function is optimized for strings having + * no CTL chars by processing up to sizeof(long) bytes at once on architectures + * supporting efficient unaligned accesses. Despite this it is not very fast + * (~0.43 byte/cycle) and should mostly be used on low match probability when + * it can save a call to a much slower function. + */ +static inline const char *ist_find_ctl(const struct ist ist) +{ + const union { unsigned long v; } __attribute__((packed)) *u; + const char *curr = (void *)ist.ptr - sizeof(long); + const char *last = curr + ist.len; + unsigned long l1, l2; + + do { + curr += sizeof(long); + if (curr > last) + break; + u = (void *)curr; + /* subtract 0x202020...20 to the value to generate a carry in + * the lower byte if the byte contains a lower value. If we + * generate a bit 7 that was not there, it means the byte was + * within 0x00..0x1F. + */ + l2 = u->v; + l1 = ~l2 & ((~0UL / 255) * 0x80); /* 0x808080...80 */ + l2 -= (~0UL / 255) * 0x20; /* 0x202020...20 */ + } while ((l1 & l2) == 0); + + last += sizeof(long); + if (__builtin_expect(curr < last, 0)) { + do { + if ((unsigned char)*curr < 0x20) + return curr; + curr++; + } while (curr < last); + } + return NULL; +} + +/* Returns a pointer to the first character found <ist> that belongs to the + * range [min:max] inclusive, or NULL if none is present. The function is + * optimized for strings having no such chars by processing up to sizeof(long) + * bytes at once on architectures supporting efficient unaligned accesses. + * Despite this it is not very fast (~0.43 byte/cycle) and should mostly be + * used on low match probability when it can save a call to a much slower + * function. Will not work for characters 0x80 and above. It's optimized for + * min and max to be known at build time. + */ +static inline const char *ist_find_range(const struct ist ist, unsigned char min, unsigned char max) +{ + const union { unsigned long v; } __attribute__((packed)) *u; + const char *curr = (void *)ist.ptr - sizeof(long); + const char *last = curr + ist.len; + unsigned long l1, l2; + + /* easier with an exclusive boundary */ + max++; + + do { + curr += sizeof(long); + if (curr > last) + break; + u = (void *)curr; + /* add 0x<min><min><min><min>..<min> then subtract + * 0x<max><max><max><max>..<max> to the value to generate a + * carry in the lower byte if the byte contains a lower value. + * If we generate a bit 7 that was not there, it means the byte + * was min..max. 
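+ *
+ * As a usage sketch (hypothetical), scanning a value <v> for any byte in
+ * the 0x00..0x08 range, with reject() as a placeholder:
+ *
+ *   if (ist_find_range(v, 0x00, 0x08))
+ *           reject();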
+ */ + l2 = u->v; + l1 = ~l2 & ((~0UL / 255) * 0x80); /* 0x808080...80 */ + l2 += (~0UL / 255) * min; /* 0x<min><min>..<min> */ + l2 -= (~0UL / 255) * max; /* 0x<max><max>..<max> */ + } while ((l1 & l2) == 0); + + last += sizeof(long); + if (__builtin_expect(curr < last, 0)) { + do { + if ((unsigned char)(*curr - min) < (unsigned char)(max - min)) + return curr; + curr++; + } while (curr < last); + } + return NULL; +} + +/* looks for first occurrence of character <chr> in string <ist> and returns + * the tail of the string starting with this character, or (ist.end,0) if not + * found. + */ +static inline struct ist istfind(const struct ist ist, char chr) +{ + struct ist ret = ist; + + while (ret.len--) { + if (*ret.ptr++ == chr) + return ist2(ret.ptr - 1, ret.len + 1); + } + return ist2(ret.ptr, 0); +} + +/* looks for first occurrence of character different from <chr> in string <ist> + * and returns the tail of the string starting at this character, or (ist_end,0) + * if not found. + */ +static inline struct ist istskip(const struct ist ist, char chr) +{ + struct ist ret = ist; + + while (ret.len--) { + if (*ret.ptr++ != chr) + return ist2(ret.ptr - 1, ret.len + 1); + } + return ist2(ret.ptr, 0); +} + +/* looks for first occurrence of string <pat> in string <ist> and returns the + * tail of the string starting at this position, or (NULL,0) if not found. The + * empty pattern is found everywhere. + */ +static inline struct ist istist(const struct ist ist, const struct ist pat) +{ + struct ist ret = ist; + size_t pos; + + if (!pat.len) + return ret; + + while (1) { + loop: + ret = istfind(ret, *pat.ptr); + if (ret.len < pat.len) + break; + + /* ret.len >= 1, pat.len >= 1 and *ret.ptr == *pat.ptr */ + + ret = istnext(ret); + for (pos = 0; pos < pat.len - 1; ) { + ++pos; + if (ret.ptr[pos - 1] != pat.ptr[pos]) + goto loop; + } + return ist2(ret.ptr - 1, ret.len + 1); + } + return IST_NULL; +} + +/* + * looks for the first occurrence of <chr> in string <ist> and returns a shorter + * ist if char is found. + */ +static inline struct ist iststop(const struct ist ist, char chr) +{ + size_t len = 0; + + while (len++ < ist.len && ist.ptr[len - 1] != chr) + ; + return ist2(ist.ptr, len - 1); +} + +/* + * advance <.ptr> by <nb> characters. + * If <ist> is too short, (ist.end,0) is returned. + */ +static inline struct ist istadv(const struct ist ist, const size_t nb) +{ + if (ist.len < nb) + return ist2(ist.ptr + ist.len, 0); + return ist2(ist.ptr + nb, ist.len - nb); +} + +/* Splits the given <ist> at the given character. The returned ist is + * equivalent to iststop(ist, delim). The passed <ist> will contain the + * remainder of the string, not including the delimiter. In other words + * it will be advanced by the length of the returned string plus 1. + */ +static inline struct ist istsplit(struct ist *ist, char delim) +{ + const struct ist result = iststop(*ist, delim); + + *ist = istadv(*ist, result.len + 1); + + return result; +} + +/* + * compare 2 ists and return non-zero if they are the same + */ +static inline int istissame(const struct ist ist1, const struct ist ist2) +{ + return ((ist1.ptr == ist2.ptr) && (ist1.len == ist2.len)); +} + +#ifndef IST_FREESTANDING +/* This function allocates <size> bytes and returns an `ist` pointing to + * the allocated area with size `0`. + * + * If this function fails to allocate memory the return value is equivalent + * to IST_NULL. + */ +static inline struct ist istalloc(const size_t size) +{ + /* Note: do not use ist2 here, as it triggers a gcc11 warning. 
+ * '<unknown>' may be used uninitialized [-Werror=maybe-uninitialized]
+ *
+ * This warning is reported because the uninitialized memory block
+ * allocated by malloc should not be passed to a const argument as in
+ * ist2.
+ * See https://gcc.gnu.org/onlinedocs/gcc-11.1.0/gcc/Warning-Options.html#index-Wmaybe-uninitialized
+ */
+        return (struct ist){ .ptr = malloc(size), .len = 0 };
+}
+
+/* This function performs the equivalent of free() on the given <ist>.
+ *
+ * After this function returns the value of the given <ist> will be
+ * modified to be equivalent to IST_NULL.
+ */
+static inline void istfree(struct ist *ist)
+{
+        free(ist->ptr);
+        *ist = IST_NULL;
+}
+
+/* This function performs the equivalent of strdup() on the given <src>.
+ *
+ * If this function fails to allocate memory the return value is equivalent
+ * to IST_NULL.
+ */
+static inline struct ist istdup(const struct ist src)
+{
+        const size_t src_size = src.len;
+
+        /* Allocate at least 1 byte to allow duplicating an empty string with
+         * malloc implementations that return NULL for a 0-size allocation.
+         */
+        struct ist dst = istalloc(src_size ? src_size : 1);
+
+        if (isttest(dst)) {
+                istcpy(&dst, src, src_size);
+        }
+
+        return dst;
+}
+#endif
+
+#endif
diff --git a/include/import/lru.h b/include/import/lru.h
new file mode 100644
index 0000000..d674e53
--- /dev/null
+++ b/include/import/lru.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2015 Willy Tarreau <w@1wt.eu>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include <import/eb64tree.h>
+
+/* The LRU supports a global cache shared between multiple domains and multiple
+ * versions of their datasets. The purpose is not to have to flush the whole
+ * LRU once a key is updated and not valid anymore (eg: ACL files), as well as
+ * to reliably support concurrent accesses and handle conflicts gracefully. For
+ * each key a pointer to a dataset and its internal data revision are stored.
+ * All lookups verify that these elements match those passed by the caller and
+ * only return a valid entry upon matching. Otherwise the entry is either
+ * allocated or recycled and considered new. New entries are always initialized
+ * with a NULL domain pointer which is used by the caller to detect that the
+ * entry is new and must be populated. Such entries never expire and are
+ * protected from the risk of being recycled. It's then the caller's
+ * responsibility to perform the operation and commit the entry with its latest
+ * result.
This domain thus serves as a lock to protect the entry during all + * the computation needed to update it. In a simple use case where the cache is + * dedicated, it is recommended to pass the LRU head as the domain pointer and + * for example zero as the revision. The most common use case for the caller + * consists in simply checking that the return is not null and that the domain + * is not null, then to use the result. The get() function returns null if it + * cannot allocate a node (memory or key being currently updated). + */ +struct lru64_list { + struct lru64_list *n; + struct lru64_list *p; +}; + +struct lru64_head { + struct lru64_list list; + struct eb_root keys; + struct lru64 *spare; + int cache_size; + int cache_usage; +}; + +struct lru64 { + struct eb64_node node; /* indexing key, typically a hash64 */ + struct lru64_list lru; /* LRU list */ + void *domain; /* who this data belongs to */ + unsigned long long revision; /* data revision (to avoid use-after-free) */ + void *data; /* returned value, user decides how to use this */ + void (*free)(void *data); /* function to release data, if needed */ +}; + + +struct lru64 *lru64_lookup(unsigned long long key, struct lru64_head *lru, void *domain, unsigned long long revision); +struct lru64 *lru64_get(unsigned long long key, struct lru64_head *lru, void *domain, unsigned long long revision); +void lru64_commit(struct lru64 *elem, void *data, void *domain, unsigned long long revision, void (*free)(void *)); +struct lru64_head *lru64_new(int size); +int lru64_destroy(struct lru64_head *lru); +void lru64_kill_oldest(struct lru64_head *lru, unsigned long int nb); diff --git a/include/import/mjson.h b/include/import/mjson.h new file mode 100644 index 0000000..b96fd3f --- /dev/null +++ b/include/import/mjson.h @@ -0,0 +1,209 @@ +// Copyright (c) 2018-2020 Cesanta Software Limited +// All rights reserved +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
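+
+// Usage sketch (hypothetical JSON document), relying only on the getters
+// declared below and assuming their usual conventions (1 on success for
+// mjson_get_number(), copied length or -1 for mjson_get_string()):
+//
+//   const char *json = "{\"user\":{\"name\":\"alice\",\"age\":30}}";
+//   char name[32];
+//   double age;
+//
+//   if (mjson_get_number(json, strlen(json), "$.user.age", &age) &&
+//       mjson_get_string(json, strlen(json), "$.user.name", name, sizeof(name)) >= 0)
+//       printf("%s is %g\n", name, age);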
+ +#ifndef MJSON_H +#define MJSON_H + +#include <stdarg.h> +#include <stdlib.h> +#include <string.h> + +#ifndef MJSON_ENABLE_PRINT +#define MJSON_ENABLE_PRINT 1 +#endif + +#ifndef MJSON_ENABLE_RPC +#define MJSON_ENABLE_RPC 1 +#endif + +#ifndef MJSON_ENABLE_BASE64 +#define MJSON_ENABLE_BASE64 1 +#endif + +#ifndef MJSON_ENABLE_MERGE +#define MJSON_ENABLE_MERGE 0 +#elif MJSON_ENABLE_MERGE +#define MJSON_ENABLE_NEXT 1 +#endif + +#ifndef MJSON_ENABLE_PRETTY +#define MJSON_ENABLE_PRETTY 0 +#elif MJSON_ENABLE_PRETTY +#define MJSON_ENABLE_NEXT 1 +#endif + +#ifndef MJSON_ENABLE_NEXT +#define MJSON_ENABLE_NEXT 0 +#endif + +#ifndef MJSON_RPC_LIST_NAME +#define MJSON_RPC_LIST_NAME "rpc.list" +#endif + +#ifndef MJSON_DYNBUF_CHUNK +#define MJSON_DYNBUF_CHUNK 256 // Allocation granularity for print_dynamic_buf +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + MJSON_ERROR_INVALID_INPUT = -1, + MJSON_ERROR_TOO_DEEP = -2, +}; + +enum mjson_tok { + MJSON_TOK_INVALID = 0, + MJSON_TOK_KEY = 1, + MJSON_TOK_STRING = 11, + MJSON_TOK_NUMBER = 12, + MJSON_TOK_TRUE = 13, + MJSON_TOK_FALSE = 14, + MJSON_TOK_NULL = 15, + MJSON_TOK_ARRAY = 91, + MJSON_TOK_OBJECT = 123, +}; +#define MJSON_TOK_IS_VALUE(t) ((t) > 10 && (t) < 20) + +typedef int (*mjson_cb_t)(int ev, const char *s, int off, int len, void *ud); + +#ifndef MJSON_MAX_DEPTH +#define MJSON_MAX_DEPTH 20 +#endif + +int mjson(const char *s, int len, mjson_cb_t cb, void *ud); +enum mjson_tok mjson_find(const char *s, int len, const char *jp, + const char **tokptr, int *toklen); +int mjson_get_number(const char *s, int len, const char *path, double *v); +int mjson_get_bool(const char *s, int len, const char *path, int *v); +int mjson_get_string(const char *s, int len, const char *path, char *to, int n); +int mjson_get_hex(const char *s, int len, const char *path, char *to, int n); + +#if MJSON_ENABLE_NEXT +int mjson_next(const char *s, int n, int off, int *koff, int *klen, int *voff, + int *vlen, int *vtype); +#endif + +#if MJSON_ENABLE_BASE64 +int mjson_get_base64(const char *s, int len, const char *path, char *to, int n); +int mjson_base64_dec(const char *src, int n, char *dst, int dlen); +#endif + +#if MJSON_ENABLE_PRINT +typedef int (*mjson_print_fn_t)(const char *buf, int len, void *userdata); +typedef int (*mjson_vprint_fn_t)(mjson_print_fn_t, void *, va_list *); + +struct mjson_fixedbuf { + char *ptr; + int size, len; +}; + +int mjson_printf(mjson_print_fn_t, void *, const char *fmt, ...); +int mjson_vprintf(mjson_print_fn_t, void *, const char *fmt, va_list ap); +int mjson_print_str(mjson_print_fn_t, void *, const char *s, int len); +int mjson_print_int(mjson_print_fn_t, void *, int value, int is_signed); +int mjson_print_long(mjson_print_fn_t, void *, long value, int is_signed); +int mjson_print_buf(mjson_print_fn_t fn, void *, const char *buf, int len); + +int mjson_print_null(const char *ptr, int len, void *userdata); +int mjson_print_fixed_buf(const char *ptr, int len, void *userdata); +int mjson_print_dynamic_buf(const char *ptr, int len, void *userdata); + +#if MJSON_ENABLE_PRETTY +int mjson_pretty(const char *, int, const char *, mjson_print_fn_t, void *); +#endif + +#if MJSON_ENABLE_MERGE +int mjson_merge(const char *, int, const char *, int, mjson_print_fn_t, void *); +#endif + +#endif // MJSON_ENABLE_PRINT + +#if MJSON_ENABLE_RPC + +void jsonrpc_init(mjson_print_fn_t, void *userdata); +int mjson_globmatch(const char *s1, int n1, const char *s2, int n2); + +struct jsonrpc_request { + struct jsonrpc_ctx *ctx; + const char *frame; // 
Points to the whole frame + int frame_len; // Frame length + const char *params; // Points to the "params" in the request frame + int params_len; // Length of the "params" + const char *id; // Points to the "id" in the request frame + int id_len; // Length of the "id" + const char *method; // Points to the "method" in the request frame + int method_len; // Length of the "method" + mjson_print_fn_t fn; // Printer function + void *fndata; // Printer function data + void *userdata; // Callback's user data as specified at export time +}; + +struct jsonrpc_method { + const char *method; + int method_sz; + void (*cb)(struct jsonrpc_request *); + struct jsonrpc_method *next; +}; + +// Main RPC context, stores current request information and a list of +// exported RPC methods. +struct jsonrpc_ctx { + struct jsonrpc_method *methods; + mjson_print_fn_t response_cb; + void *response_cb_data; +}; + +// Registers function fn under the given name within the given RPC context +#define jsonrpc_ctx_export(ctx, name, fn) \ + do { \ + static struct jsonrpc_method m = {(name), sizeof(name) - 1, (fn), 0}; \ + m.next = (ctx)->methods; \ + (ctx)->methods = &m; \ + } while (0) + +void jsonrpc_ctx_init(struct jsonrpc_ctx *ctx, mjson_print_fn_t, void *); +void jsonrpc_return_error(struct jsonrpc_request *r, int code, + const char *message, const char *data_fmt, ...); +void jsonrpc_return_success(struct jsonrpc_request *r, const char *result_fmt, + ...); +void jsonrpc_ctx_process(struct jsonrpc_ctx *ctx, const char *req, int req_sz, + mjson_print_fn_t fn, void *fndata, void *userdata); + +extern struct jsonrpc_ctx jsonrpc_default_context; + +#define jsonrpc_export(name, fn) \ + jsonrpc_ctx_export(&jsonrpc_default_context, (name), (fn)) + +#define jsonrpc_process(buf, len, fn, fnd, ud) \ + jsonrpc_ctx_process(&jsonrpc_default_context, (buf), (len), (fn), (fnd), (ud)) + +#define JSONRPC_ERROR_INVALID -32700 /* Invalid JSON was received */ +#define JSONRPC_ERROR_NOT_FOUND -32601 /* The method does not exist */ +#define JSONRPC_ERROR_BAD_PARAMS -32602 /* Invalid params passed */ +#define JSONRPC_ERROR_INTERNAL -32603 /* Internal JSON-RPC error */ + +#endif // MJSON_ENABLE_RPC +#ifdef __cplusplus +} +#endif +#endif // MJSON_H diff --git a/include/import/plock.h b/include/import/plock.h new file mode 100644 index 0000000..fc001e2 --- /dev/null +++ b/include/import/plock.h @@ -0,0 +1,1422 @@ +/* plock - progressive locks + * + * Copyright (C) 2012-2017 Willy Tarreau <w@1wt.eu> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef PL_PLOCK_H +#define PL_PLOCK_H + +#include "atomic-ops.h" +#ifdef _POSIX_PRIORITY_SCHEDULING +#include <sched.h> +#endif + +/* 64 bit */ +#define PLOCK64_RL_1 0x0000000000000004ULL +#define PLOCK64_RL_2PL 0x00000000FFFFFFF8ULL +#define PLOCK64_RL_ANY 0x00000000FFFFFFFCULL +#define PLOCK64_SL_1 0x0000000100000000ULL +#define PLOCK64_SL_ANY 0x0000000300000000ULL +#define PLOCK64_WL_1 0x0000000400000000ULL +#define PLOCK64_WL_2PL 0xFFFFFFF800000000ULL +#define PLOCK64_WL_ANY 0xFFFFFFFC00000000ULL + +/* 32 bit */ +#define PLOCK32_RL_1 0x00000004 +#define PLOCK32_RL_2PL 0x0000FFF8 +#define PLOCK32_RL_ANY 0x0000FFFC +#define PLOCK32_SL_1 0x00010000 +#define PLOCK32_SL_ANY 0x00030000 +#define PLOCK32_WL_1 0x00040000 +#define PLOCK32_WL_2PL 0xFFF80000 +#define PLOCK32_WL_ANY 0xFFFC0000 + +/* dereferences <*p> as unsigned long without causing aliasing issues */ +#define pl_deref_long(p) ({ volatile unsigned long *__pl_l = (unsigned long *)(p); *__pl_l; }) + +/* dereferences <*p> as unsigned int without causing aliasing issues */ +#define pl_deref_int(p) ({ volatile unsigned int *__pl_i = (unsigned int *)(p); *__pl_i; }) + +/* This function waits for <lock> to release all bits covered by <mask>, and + * enforces an exponential backoff using CPU pauses to limit the pollution to + * the other threads' caches. The progression follows (1.5^N)-1, limited to + * 16384 iterations, which is way sufficient even for very large numbers of + * threads. It's possible to disable exponential backoff (EBO) for debugging + * purposes by setting PLOCK_DISABLE_EBO, in which case the function will be + * replaced with a simpler macro. This may for example be useful to more + * easily track callers' CPU usage. The macro was not designed to be used + * outside of the functions defined here. + */ +#if defined(PLOCK_DISABLE_EBO) +#define pl_wait_unlock_long(lock, mask) \ + ({ \ + unsigned long _r; \ + do { \ + pl_cpu_relax(); \ + _r = pl_deref_long(lock); \ + } while (_r & mask); \ + _r; /* return value */ \ + }) +#else /* not PLOCK_DISABLE_EBO */ +__attribute__((unused,always_inline,no_instrument_function)) inline +static unsigned long __pl_wait_unlock_long(const unsigned long *lock, const unsigned long mask) +{ + unsigned long ret; + unsigned int m = 0; + + do { + unsigned int loops = m; + +#ifdef _POSIX_PRIORITY_SCHEDULING + if (loops >= 65536) { + sched_yield(); + loops -= 32768; + } +#endif + for (; loops >= 60; loops --) + pl_cpu_relax(); + + for (; loops >= 1; loops--) + pl_barrier(); + + ret = pl_load(lock); + if (__builtin_expect(ret & mask, 0) == 0) + break; + + /* the below produces an exponential growth with loops to lower + * values and still growing. This allows competing threads to + * wait different times once the threshold is reached. 
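+ *
+ * For instance, the successive values taken by <m> below are 0, 2, 5, 9,
+ * 15, 24, 38, 59, 90, ... i.e. roughly multiplied by 1.5 on each round.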
+ */
+                m = ((m + (m >> 1)) + 2) & 0x3ffff;
+        } while (1);
+
+        return ret;
+}
+
+# if defined(PLOCK_INLINE_EBO)
+__attribute__((unused,always_inline,no_instrument_function)) inline
+# else
+__attribute__((unused,noinline,no_instrument_function))
+# endif
+static unsigned long pl_wait_unlock_long(const unsigned long *lock, const unsigned long mask)
+{
+        return __pl_wait_unlock_long(lock, mask);
+}
+#endif /* PLOCK_DISABLE_EBO */
+
+/* This function waits for <lock> to release all bits covered by <mask>, and
+ * enforces an exponential backoff using CPU pauses to limit the pollution to
+ * the other threads' caches. The progression follows (1.5^N)-1, limited to
+ * 16384 iterations, which is way sufficient even for very large numbers of
+ * threads. The function slightly benefits from size optimization under gcc,
+ * but Clang cannot do it, so it's not done here, as it doesn't make a big
+ * difference. It is possible to disable exponential backoff (EBO) for
+ * debugging purposes by setting PLOCK_DISABLE_EBO, in which case the function
+ * will be replaced with a simpler macro. This may for example be useful to
+ * more easily track callers' CPU usage. The macro was not designed to be used
+ * outside of the functions defined here.
+ */
+#if defined(PLOCK_DISABLE_EBO)
+#define pl_wait_unlock_int(lock, mask) \
+        ({ \
+                unsigned int _r; \
+                do { \
+                        pl_cpu_relax(); \
+                        _r = pl_deref_int(lock); \
+                } while (_r & mask); \
+                _r; /* return value */ \
+        })
+#else
+__attribute__((unused,always_inline,no_instrument_function)) inline
+static unsigned int __pl_wait_unlock_int(const unsigned int *lock, const unsigned int mask)
+{
+        unsigned int ret;
+        unsigned int m = 0;
+
+        do {
+                unsigned int loops = m;
+
+#ifdef _POSIX_PRIORITY_SCHEDULING
+                if (loops >= 65536) {
+                        sched_yield();
+                        loops -= 32768;
+                }
+#endif
+                for (; loops >= 200; loops -= 10)
+                        pl_cpu_relax();
+
+                for (; loops >= 1; loops--)
+                        pl_barrier();
+
+                ret = pl_deref_int(lock);
+                if (__builtin_expect(ret & mask, 0) == 0)
+                        break;
+
+                /* the below produces an exponential growth with loops to lower
+                 * values and still growing. This allows competing threads to
+                 * wait different times once the threshold is reached.
+                 */
+                m = ((m + (m >> 1)) + 2) & 0x3ffff;
+        } while (1);
+
+        return ret;
+}
+
+# if defined(PLOCK_INLINE_EBO)
+__attribute__((unused,always_inline,no_instrument_function)) inline
+# else
+__attribute__((unused,noinline,no_instrument_function))
+# endif
+static unsigned int pl_wait_unlock_int(const unsigned int *lock, const unsigned int mask)
+{
+        return __pl_wait_unlock_int(lock, mask);
+}
+#endif /* PLOCK_DISABLE_EBO */
+
+/* This function waits for <lock> to change from value <prev> and returns the
+ * new value. It enforces an exponential backoff using CPU pauses to limit the
+ * pollution to the other threads' caches. The progression follows (2^N)-1,
+ * limited to 255 iterations, which is way sufficient even for very large
+ * numbers of threads. It is designed to be called after a first test which
+ * retrieves the previous value, so it starts by waiting. The function slightly
+ * benefits from size optimization under gcc, but Clang cannot do it, so it's
+ * not done here, as it doesn't make a big difference.
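+ *
+ * Typical use (a sketch, with a hypothetical lock word <lk> and wait mask
+ * WAIT_MASK):
+ *
+ *   unsigned long cur = pl_deref_long(&lk);
+ *   while (cur & WAIT_MASK)
+ *           cur = pl_wait_new_long(&lk, cur);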
+ */ +__attribute__((unused,noinline,no_instrument_function)) +static unsigned long pl_wait_new_long(const unsigned long *lock, const unsigned long prev) +{ + unsigned char m = 0; + unsigned long curr; + + do { + unsigned char loops = m + 1; + m = (m << 1) + 1; + do { + pl_cpu_relax(); + } while (__builtin_expect(--loops, 0)); + curr = pl_deref_long(lock); + } while (__builtin_expect(curr == prev, 0)); + return curr; +} + +/* This function waits for <lock> to change from value <prev> and returns the + * new value. It enforces an exponential backoff using CPU pauses to limit the + * pollution to the other threads' caches. The progression follows (2^N)-1, + * limited to 255 iterations, which is way sufficient even for very large + * numbers of threads. It is designed to be called after a first test which + * retrieves the previous value, so it starts by waiting. The function slightly + * benefits from size optimization under gcc, but Clang cannot do it, so it's + * not done here, as it doesn't make a big difference. + */ +__attribute__((unused,noinline,no_instrument_function)) +static unsigned int pl_wait_new_int(const unsigned int *lock, const unsigned int prev) +{ + unsigned char m = 0; + unsigned int curr; + + do { + unsigned char loops = m + 1; + m = (m << 1) + 1; + do { + pl_cpu_relax(); + } while (__builtin_expect(--loops, 0)); + curr = pl_deref_int(lock); + } while (__builtin_expect(curr == prev, 0)); + return curr; +} + +/* request shared read access (R), return non-zero on success, otherwise 0 */ +#define pl_try_r(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long __pl_r = pl_deref_long(lock) & PLOCK64_WL_ANY; \ + pl_barrier(); \ + if (!__builtin_expect(__pl_r, 0)) { \ + __pl_r = pl_ldadd_acq((lock), PLOCK64_RL_1) & PLOCK64_WL_ANY; \ + if (__builtin_expect(__pl_r, 0)) \ + pl_sub_noret((lock), PLOCK64_RL_1); \ + } \ + !__pl_r; /* return value */ \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + register unsigned int __pl_r = pl_deref_int(lock) & PLOCK32_WL_ANY; \ + pl_barrier(); \ + if (!__builtin_expect(__pl_r, 0)) { \ + __pl_r = pl_ldadd_acq((lock), PLOCK32_RL_1) & PLOCK32_WL_ANY; \ + if (__builtin_expect(__pl_r, 0)) \ + pl_sub_noret((lock), PLOCK32_RL_1); \ + } \ + !__pl_r; /* return value */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_try_r__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_try_r__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* request shared read access (R) and wait for it. In order not to disturb a W + * lock waiting for all readers to leave, we first check if a W lock is held + * before trying to claim the R lock. + */ +#define pl_take_r(lock) \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long *__lk_r = (unsigned long *)(lock); \ + register unsigned long __set_r = PLOCK64_RL_1; \ + register unsigned long __msk_r = PLOCK64_WL_ANY; \ + register unsigned long __old_r = pl_cmpxchg(__lk_r, 0, __set_r); \ + if (__old_r) { \ + while (1) { \ + if (__old_r & __msk_r) \ + pl_wait_unlock_long(__lk_r, __msk_r); \ + if (!(pl_ldadd_acq(__lk_r, __set_r) & __msk_r)) \ + break; \ + __old_r = pl_sub_lax(__lk_r, __set_r); \ + } \ + } \ + pl_barrier(); \ + 0; \ + }) : (sizeof(*(lock)) == 4) ? 
({ \ + register unsigned int *__lk_r = (unsigned int *)(lock); \ + register unsigned int __set_r = PLOCK32_RL_1; \ + register unsigned int __msk_r = PLOCK32_WL_ANY; \ + register unsigned int __old_r = pl_cmpxchg(__lk_r, 0, __set_r); \ + if (__old_r) { \ + while (1) { \ + if (__old_r & __msk_r) \ + pl_wait_unlock_int(__lk_r, __msk_r); \ + if (!(pl_ldadd_acq(__lk_r, __set_r) & __msk_r)) \ + break; \ + __old_r = pl_sub_lax(__lk_r, __set_r); \ + } \ + } \ + pl_barrier(); \ + 0; \ + }) : ({ \ + void __unsupported_argument_size_for_pl_take_r__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_take_r__(__FILE__,__LINE__); \ + 0; \ + }) + +/* release the read access (R) lock */ +#define pl_drop_r(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + pl_barrier(); \ + pl_sub_noret_rel(lock, PLOCK64_RL_1); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + pl_barrier(); \ + pl_sub_noret_rel(lock, PLOCK32_RL_1); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_drop_r__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_drop_r__(__FILE__,__LINE__); \ + }) \ +) + +/* request a seek access (S), return non-zero on success, otherwise 0 */ +#define pl_try_s(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long __pl_r = pl_deref_long(lock); \ + pl_barrier(); \ + if (!__builtin_expect(__pl_r & (PLOCK64_WL_ANY | PLOCK64_SL_ANY), 0)) { \ + __pl_r = pl_ldadd_acq((lock), PLOCK64_SL_1 | PLOCK64_RL_1) & \ + (PLOCK64_WL_ANY | PLOCK64_SL_ANY); \ + if (__builtin_expect(__pl_r, 0)) \ + pl_sub_noret_lax((lock), PLOCK64_SL_1 | PLOCK64_RL_1); \ + } \ + !__pl_r; /* return value */ \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + register unsigned int __pl_r = pl_deref_int(lock); \ + pl_barrier(); \ + if (!__builtin_expect(__pl_r & (PLOCK32_WL_ANY | PLOCK32_SL_ANY), 0)) { \ + __pl_r = pl_ldadd_acq((lock), PLOCK32_SL_1 | PLOCK32_RL_1) & \ + (PLOCK32_WL_ANY | PLOCK32_SL_ANY); \ + if (__builtin_expect(__pl_r, 0)) \ + pl_sub_noret_lax((lock), PLOCK32_SL_1 | PLOCK32_RL_1); \ + } \ + !__pl_r; /* return value */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_try_s__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_try_s__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* request a seek access (S) and wait for it. The lock is immediately claimed, + * and only upon failure an exponential backoff is used. S locks rarely compete + * with W locks so S will generally not disturb W. As the S lock may be used as + * a spinlock, it's important to grab it as fast as possible. + */ +#define pl_take_s(lock) \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long *__lk_r = (unsigned long *)(lock); \ + register unsigned long __set_r = PLOCK64_SL_1 | PLOCK64_RL_1; \ + register unsigned long __msk_r = PLOCK64_WL_ANY | PLOCK64_SL_ANY; \ + while (1) { \ + if (!__builtin_expect(pl_ldadd_acq(__lk_r, __set_r) & __msk_r, 0)) \ + break; \ + pl_sub_noret_lax(__lk_r, __set_r); \ + pl_wait_unlock_long(__lk_r, __msk_r); \ + } \ + pl_barrier(); \ + 0; \ + }) : (sizeof(*(lock)) == 4) ? 
({ \ + register unsigned int *__lk_r = (unsigned int *)(lock); \ + register unsigned int __set_r = PLOCK32_SL_1 | PLOCK32_RL_1; \ + register unsigned int __msk_r = PLOCK32_WL_ANY | PLOCK32_SL_ANY; \ + while (1) { \ + if (!__builtin_expect(pl_ldadd_acq(__lk_r, __set_r) & __msk_r, 0)) \ + break; \ + pl_sub_noret_lax(__lk_r, __set_r); \ + pl_wait_unlock_int(__lk_r, __msk_r); \ + } \ + pl_barrier(); \ + 0; \ + }) : ({ \ + void __unsupported_argument_size_for_pl_take_s__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_take_s__(__FILE__,__LINE__); \ + 0; \ + }) + +/* release the seek access (S) lock */ +#define pl_drop_s(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + pl_barrier(); \ + pl_sub_noret_rel(lock, PLOCK64_SL_1 + PLOCK64_RL_1); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + pl_barrier(); \ + pl_sub_noret_rel(lock, PLOCK32_SL_1 + PLOCK32_RL_1); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_drop_s__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_drop_s__(__FILE__,__LINE__); \ + }) \ +) + +/* drop the S lock and go back to the R lock */ +#define pl_stor(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + pl_barrier(); \ + pl_sub_noret(lock, PLOCK64_SL_1); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + pl_barrier(); \ + pl_sub_noret(lock, PLOCK32_SL_1); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_stor__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_stor__(__FILE__,__LINE__); \ + }) \ +) + +/* take the W lock under the S lock */ +#define pl_stow(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long __pl_r = pl_ldadd((lock), PLOCK64_WL_1); \ + if (__pl_r & (PLOCK64_RL_ANY & ~PLOCK64_RL_1)) \ + __pl_r = pl_wait_unlock_long((const unsigned long*)lock, (PLOCK64_RL_ANY & ~PLOCK64_RL_1)); \ + pl_barrier(); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + register unsigned int __pl_r = pl_ldadd((lock), PLOCK32_WL_1); \ + if (__pl_r & (PLOCK32_RL_ANY & ~PLOCK32_RL_1)) \ + __pl_r = pl_wait_unlock_int((const unsigned int*)lock, (PLOCK32_RL_ANY & ~PLOCK32_RL_1)); \ + pl_barrier(); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_stow__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_stow__(__FILE__,__LINE__); \ + }) \ +) + +/* drop the W lock and go back to the S lock */ +#define pl_wtos(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + pl_barrier(); \ + pl_sub_noret(lock, PLOCK64_WL_1); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + pl_barrier(); \ + pl_sub_noret(lock, PLOCK32_WL_1); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_wtos__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_wtos__(__FILE__,__LINE__); \ + }) \ +) + +/* drop the W lock and go back to the R lock */ +#define pl_wtor(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + pl_barrier(); \ + pl_sub_noret(lock, PLOCK64_WL_1 | PLOCK64_SL_1); \ + }) : (sizeof(*(lock)) == 4) ? 
({ \ + pl_barrier(); \ + pl_sub_noret(lock, PLOCK32_WL_1 | PLOCK32_SL_1); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_wtor__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_wtor__(__FILE__,__LINE__); \ + }) \ +) + +/* request a write access (W), return non-zero on success, otherwise 0. + * + * Below there is something important : by taking both W and S, we will cause + * an overflow of W at 4/5 of the maximum value that can be stored into W due + * to the fact that S is 2 bits, so we're effectively adding 5 to the word + * composed by W:S. But for all words multiple of 4 bits, the maximum value is + * multiple of 15 thus of 5. So the largest value we can store with all bits + * set to one will be met by adding 5, and then adding 5 again will place value + * 1 in W and value 0 in S, so we never leave W with 0. Also, even upon such an + * overflow, there's no risk to confuse it with an atomic lock because R is not + * null since it will not have overflown. For 32-bit locks, this situation + * happens when exactly 13108 threads try to grab the lock at once, W=1, S=0 + * and R=13108. For 64-bit locks, it happens at 858993460 concurrent writers + * where W=1, S=0 and R=858993460. + */ +#define pl_try_w(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long __pl_r = pl_deref_long(lock); \ + pl_barrier(); \ + if (!__builtin_expect(__pl_r & (PLOCK64_WL_ANY | PLOCK64_SL_ANY), 0)) { \ + __pl_r = pl_ldadd_acq((lock), PLOCK64_WL_1 | PLOCK64_SL_1 | PLOCK64_RL_1);\ + if (__builtin_expect(__pl_r & (PLOCK64_WL_ANY | PLOCK64_SL_ANY), 0)) { \ + /* a writer, seeker or atomic is present, let's leave */ \ + pl_sub_noret_lax((lock), PLOCK64_WL_1 | PLOCK64_SL_1 | PLOCK64_RL_1);\ + __pl_r &= (PLOCK64_WL_ANY | PLOCK64_SL_ANY); /* return value */\ + } else { \ + /* wait for all other readers to leave */ \ + while (__pl_r) \ + __pl_r = pl_deref_long(lock) - \ + (PLOCK64_WL_1 | PLOCK64_SL_1 | PLOCK64_RL_1); \ + } \ + } \ + !__pl_r; /* return value */ \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + register unsigned int __pl_r = pl_deref_int(lock); \ + pl_barrier(); \ + if (!__builtin_expect(__pl_r & (PLOCK32_WL_ANY | PLOCK32_SL_ANY), 0)) { \ + __pl_r = pl_ldadd_acq((lock), PLOCK32_WL_1 | PLOCK32_SL_1 | PLOCK32_RL_1);\ + if (__builtin_expect(__pl_r & (PLOCK32_WL_ANY | PLOCK32_SL_ANY), 0)) { \ + /* a writer, seeker or atomic is present, let's leave */ \ + pl_sub_noret_lax((lock), PLOCK32_WL_1 | PLOCK32_SL_1 | PLOCK32_RL_1);\ + __pl_r &= (PLOCK32_WL_ANY | PLOCK32_SL_ANY); /* return value */\ + } else { \ + /* wait for all other readers to leave */ \ + while (__pl_r) \ + __pl_r = pl_deref_int(lock) - \ + (PLOCK32_WL_1 | PLOCK32_SL_1 | PLOCK32_RL_1); \ + } \ + } \ + !__pl_r; /* return value */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_try_w__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_try_w__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* request a write access (W) and wait for it. The lock is immediately claimed, + * and only upon failure an exponential backoff is used. + */ +#define pl_take_w(lock) \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? 
({ \ + register unsigned long *__lk_r = (unsigned long *)(lock); \ + register unsigned long __set_r = PLOCK64_WL_1 | PLOCK64_SL_1 | PLOCK64_RL_1; \ + register unsigned long __msk_r = PLOCK64_WL_ANY | PLOCK64_SL_ANY; \ + register unsigned long __pl_r; \ + while (1) { \ + __pl_r = pl_ldadd_acq(__lk_r, __set_r); \ + if (!__builtin_expect(__pl_r & __msk_r, 0)) \ + break; \ + pl_sub_noret_lax(__lk_r, __set_r); \ + __pl_r = pl_wait_unlock_long(__lk_r, __msk_r); \ + } \ + /* wait for all other readers to leave */ \ + if (__builtin_expect(__pl_r & PLOCK64_RL_ANY, 0)) \ + __pl_r = pl_wait_unlock_long(__lk_r, (PLOCK64_RL_ANY & ~PLOCK64_RL_1)) - __set_r; \ + pl_barrier(); \ + 0; \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + register unsigned int *__lk_r = (unsigned int *)(lock); \ + register unsigned int __set_r = PLOCK32_WL_1 | PLOCK32_SL_1 | PLOCK32_RL_1; \ + register unsigned int __msk_r = PLOCK32_WL_ANY | PLOCK32_SL_ANY; \ + register unsigned int __pl_r; \ + while (1) { \ + __pl_r = pl_ldadd_acq(__lk_r, __set_r); \ + if (!__builtin_expect(__pl_r & __msk_r, 0)) \ + break; \ + pl_sub_noret_lax(__lk_r, __set_r); \ + __pl_r = pl_wait_unlock_int(__lk_r, __msk_r); \ + } \ + /* wait for all other readers to leave */ \ + if (__builtin_expect(__pl_r & PLOCK32_RL_ANY, 0)) \ + __pl_r = pl_wait_unlock_int(__lk_r, (PLOCK32_RL_ANY & ~PLOCK32_RL_1)) - __set_r; \ + pl_barrier(); \ + 0; \ + }) : ({ \ + void __unsupported_argument_size_for_pl_take_w__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_take_w__(__FILE__,__LINE__); \ + 0; \ + }) + +/* drop the write (W) lock entirely */ +#define pl_drop_w(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + pl_barrier(); \ + pl_sub_noret_rel(lock, PLOCK64_WL_1 | PLOCK64_SL_1 | PLOCK64_RL_1); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + pl_barrier(); \ + pl_sub_noret_rel(lock, PLOCK32_WL_1 | PLOCK32_SL_1 | PLOCK32_RL_1); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_drop_w__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_drop_w__(__FILE__,__LINE__); \ + }) \ +) + +/* Try to upgrade from R to S, return non-zero on success, otherwise 0. + * This lock will fail if S or W are already held. In case of failure to grab + * the lock, it MUST NOT be retried without first dropping R, or it may never + * complete due to S waiting for R to leave before upgrading to W. + */ +#define pl_try_rtos(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long __pl_r; \ + __pl_r = pl_ldadd_acq((lock), PLOCK64_SL_1) & (PLOCK64_WL_ANY | PLOCK64_SL_ANY);\ + if (__builtin_expect(__pl_r, 0)) \ + pl_sub_noret_lax((lock), PLOCK64_SL_1); \ + !__pl_r; /* return value */ \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + register unsigned int __pl_r; \ + __pl_r = pl_ldadd_acq((lock), PLOCK32_SL_1) & (PLOCK32_WL_ANY | PLOCK32_SL_ANY);\ + if (__builtin_expect(__pl_r, 0)) \ + pl_sub_noret_lax((lock), PLOCK32_SL_1); \ + !__pl_r; /* return value */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_try_rtos__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_try_rtos__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + + +/* Try to upgrade from R to W, return non-zero on success, otherwise 0. + * This lock will fail if S or W are already held. 
In case of failure to grab
+ * the lock, it MUST NOT be retried without first dropping R, or it may never
+ * complete due to S waiting for R to leave before upgrading to W. It waits for
+ * the last readers to leave.
+ */
+#define pl_try_rtow(lock) (                                                   \
+	(sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({                      \
+		register unsigned long *__lk_r = (unsigned long *)(lock);     \
+		register unsigned long __set_r = PLOCK64_WL_1 | PLOCK64_SL_1; \
+		register unsigned long __msk_r = PLOCK64_WL_ANY | PLOCK64_SL_ANY; \
+		register unsigned long __pl_r;                                \
+		pl_barrier();                                                 \
+		while (1) {                                                   \
+			__pl_r = pl_ldadd_acq(__lk_r, __set_r);               \
+			if (__builtin_expect(__pl_r & __msk_r, 0)) {          \
+				if (pl_ldadd_lax(__lk_r, - __set_r))          \
+					break; /* the caller needs to drop the lock now */ \
+				continue;  /* lock was released, try again */ \
+			}                                                     \
+			/* ok we're the only writer, wait for readers to leave */ \
+			while (__builtin_expect(__pl_r, 0))                   \
+				__pl_r = pl_deref_long(__lk_r) - (PLOCK64_WL_1|PLOCK64_SL_1|PLOCK64_RL_1); \
+			/* now return with __pl_r = 0 */                      \
+			break;                                                \
+		}                                                             \
+		!__pl_r; /* return value */                                   \
+	}) : (sizeof(*(lock)) == 4) ? ({                                      \
+		register unsigned int *__lk_r = (unsigned int *)(lock);       \
+		register unsigned int __set_r = PLOCK32_WL_1 | PLOCK32_SL_1;  \
+		register unsigned int __msk_r = PLOCK32_WL_ANY | PLOCK32_SL_ANY; \
+		register unsigned int __pl_r;                                 \
+		pl_barrier();                                                 \
+		while (1) {                                                   \
+			__pl_r = pl_ldadd_acq(__lk_r, __set_r);               \
+			if (__builtin_expect(__pl_r & __msk_r, 0)) {          \
+				if (pl_ldadd_lax(__lk_r, - __set_r))          \
+					break; /* the caller needs to drop the lock now */ \
+				continue;  /* lock was released, try again */ \
+			}                                                     \
+			/* ok we're the only writer, wait for readers to leave */ \
+			while (__builtin_expect(__pl_r, 0))                   \
+				__pl_r = pl_deref_int(__lk_r) - (PLOCK32_WL_1|PLOCK32_SL_1|PLOCK32_RL_1); \
+			/* now return with __pl_r = 0 */                      \
+			break;                                                \
+		}                                                             \
+		!__pl_r; /* return value */                                   \
+	}) : ({                                                               \
+		void __unsupported_argument_size_for_pl_try_rtow__(char *,int); \
+		if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \
+			__unsupported_argument_size_for_pl_try_rtow__(__FILE__,__LINE__); \
+		0;                                                            \
+	})                                                                    \
+)
+
+
+/* request atomic write access (A), return non-zero on success, otherwise 0.
+ * It's a bit tricky as we only use the W bits for this and want to distinguish
+ * between other atomic users and regular lock users. We have to give up if an
+ * S lock appears. It's possible that such a lock stays hidden in the W bits
+ * after an overflow, but in this case R is still held, ensuring we stay in the
+ * loop until we discover the conflict. The lock only returns successfully if
+ * all readers are gone (or converted to A).
+ */
+#define pl_try_a(lock) (                                                      \
+	(sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({                      \
+		register unsigned long __pl_r = pl_deref_long(lock) & PLOCK64_SL_ANY; \
+		pl_barrier();                                                 \
+		if (!__builtin_expect(__pl_r, 0)) {                           \
+			__pl_r = pl_ldadd_acq((lock), PLOCK64_WL_1);          \
+			while (1) {                                           \
+				if (__builtin_expect(__pl_r & PLOCK64_SL_ANY, 0)) { \
+					pl_sub_noret_lax((lock), PLOCK64_WL_1); \
+					break;  /* return !__pl_r */          \
+				}                                             \
+				__pl_r &= PLOCK64_RL_ANY;                     \
+				if (!__builtin_expect(__pl_r, 0))             \
+					break;  /* return !__pl_r */          \
+				__pl_r = pl_deref_long(lock);                 \
+			}                                                     \
+		}                                                             \
+		!__pl_r; /* return value */                                   \
+	}) : (sizeof(*(lock)) == 4) ?
({ \ + register unsigned int __pl_r = pl_deref_int(lock) & PLOCK32_SL_ANY; \ + pl_barrier(); \ + if (!__builtin_expect(__pl_r, 0)) { \ + __pl_r = pl_ldadd_acq((lock), PLOCK32_WL_1); \ + while (1) { \ + if (__builtin_expect(__pl_r & PLOCK32_SL_ANY, 0)) { \ + pl_sub_noret_lax((lock), PLOCK32_WL_1); \ + break; /* return !__pl_r */ \ + } \ + __pl_r &= PLOCK32_RL_ANY; \ + if (!__builtin_expect(__pl_r, 0)) \ + break; /* return !__pl_r */ \ + __pl_r = pl_deref_int(lock); \ + } \ + } \ + !__pl_r; /* return value */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_try_a__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_try_a__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* request atomic write access (A) and wait for it. See comments in pl_try_a() for + * explanations. + */ +#define pl_take_a(lock) \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long *__lk_r = (unsigned long *)(lock); \ + register unsigned long __set_r = PLOCK64_WL_1; \ + register unsigned long __msk_r = PLOCK64_SL_ANY; \ + register unsigned long __pl_r; \ + __pl_r = pl_ldadd_acq(__lk_r, __set_r); \ + while (__builtin_expect(__pl_r & PLOCK64_RL_ANY, 0)) { \ + if (__builtin_expect(__pl_r & __msk_r, 0)) { \ + pl_sub_noret_lax(__lk_r, __set_r); \ + pl_wait_unlock_long(__lk_r, __msk_r); \ + __pl_r = pl_ldadd_acq(__lk_r, __set_r); \ + continue; \ + } \ + /* wait for all readers to leave or upgrade */ \ + pl_cpu_relax(); pl_cpu_relax(); pl_cpu_relax(); \ + __pl_r = pl_deref_long(lock); \ + } \ + pl_barrier(); \ + 0; \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + register unsigned int *__lk_r = (unsigned int *)(lock); \ + register unsigned int __set_r = PLOCK32_WL_1; \ + register unsigned int __msk_r = PLOCK32_SL_ANY; \ + register unsigned int __pl_r; \ + __pl_r = pl_ldadd_acq(__lk_r, __set_r); \ + while (__builtin_expect(__pl_r & PLOCK32_RL_ANY, 0)) { \ + if (__builtin_expect(__pl_r & __msk_r, 0)) { \ + pl_sub_noret_lax(__lk_r, __set_r); \ + pl_wait_unlock_int(__lk_r, __msk_r); \ + __pl_r = pl_ldadd_acq(__lk_r, __set_r); \ + continue; \ + } \ + /* wait for all readers to leave or upgrade */ \ + pl_cpu_relax(); pl_cpu_relax(); pl_cpu_relax(); \ + __pl_r = pl_deref_int(lock); \ + } \ + pl_barrier(); \ + 0; \ + }) : ({ \ + void __unsupported_argument_size_for_pl_take_a__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_take_a__(__FILE__,__LINE__); \ + 0; \ + }) + +/* release atomic write access (A) lock */ +#define pl_drop_a(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + pl_barrier(); \ + pl_sub_noret_rel(lock, PLOCK64_WL_1); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + pl_barrier(); \ + pl_sub_noret_rel(lock, PLOCK32_WL_1); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_drop_a__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_drop_a__(__FILE__,__LINE__); \ + }) \ +) + +/* Downgrade A to R. Inc(R), dec(W) then wait for W==0 */ +#define pl_ator(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? 
({ \ + register unsigned long *__lk_r = (unsigned long *)(lock); \ + register unsigned long __set_r = PLOCK64_RL_1 - PLOCK64_WL_1; \ + register unsigned long __msk_r = PLOCK64_WL_ANY; \ + register unsigned long __pl_r = pl_ldadd(__lk_r, __set_r) + __set_r; \ + while (__builtin_expect(__pl_r & __msk_r, 0)) { \ + __pl_r = pl_wait_unlock_long(__lk_r, __msk_r); \ + } \ + pl_barrier(); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + register unsigned int *__lk_r = (unsigned int *)(lock); \ + register unsigned int __set_r = PLOCK32_RL_1 - PLOCK32_WL_1; \ + register unsigned int __msk_r = PLOCK32_WL_ANY; \ + register unsigned int __pl_r = pl_ldadd(__lk_r, __set_r) + __set_r; \ + while (__builtin_expect(__pl_r & __msk_r, 0)) { \ + __pl_r = pl_wait_unlock_int(__lk_r, __msk_r); \ + } \ + pl_barrier(); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_ator__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_ator__(__FILE__,__LINE__); \ + }) \ +) + +/* Try to upgrade from R to A, return non-zero on success, otherwise 0. + * This lock will fail if S is held or appears while waiting (typically due to + * a previous grab that was disguised as a W due to an overflow). In case of + * failure to grab the lock, it MUST NOT be retried without first dropping R, + * or it may never complete due to S waiting for R to leave before upgrading + * to W. The lock succeeds once there's no more R (ie all of them have either + * completed or were turned to A). + */ +#define pl_try_rtoa(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long __pl_r = pl_deref_long(lock) & PLOCK64_SL_ANY; \ + pl_barrier(); \ + if (!__builtin_expect(__pl_r, 0)) { \ + __pl_r = pl_ldadd_acq((lock), PLOCK64_WL_1 - PLOCK64_RL_1); \ + while (1) { \ + if (__builtin_expect(__pl_r & PLOCK64_SL_ANY, 0)) { \ + pl_sub_noret_lax((lock), PLOCK64_WL_1 - PLOCK64_RL_1); \ + break; /* return !__pl_r */ \ + } \ + __pl_r &= PLOCK64_RL_ANY; \ + if (!__builtin_expect(__pl_r, 0)) \ + break; /* return !__pl_r */ \ + __pl_r = pl_deref_long(lock); \ + } \ + } \ + !__pl_r; /* return value */ \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + register unsigned int __pl_r = pl_deref_int(lock) & PLOCK32_SL_ANY; \ + pl_barrier(); \ + if (!__builtin_expect(__pl_r, 0)) { \ + __pl_r = pl_ldadd_acq((lock), PLOCK32_WL_1 - PLOCK32_RL_1); \ + while (1) { \ + if (__builtin_expect(__pl_r & PLOCK32_SL_ANY, 0)) { \ + pl_sub_noret_lax((lock), PLOCK32_WL_1 - PLOCK32_RL_1); \ + break; /* return !__pl_r */ \ + } \ + __pl_r &= PLOCK32_RL_ANY; \ + if (!__builtin_expect(__pl_r, 0)) \ + break; /* return !__pl_r */ \ + __pl_r = pl_deref_int(lock); \ + } \ + } \ + !__pl_r; /* return value */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_try_rtoa__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_try_rtoa__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + + +/* + * The following operations cover the multiple writers model : U->R->J->C->A + */ + + +/* Upgrade R to J. Inc(W) then wait for R==W or S != 0 */ +#define pl_rtoj(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? 
({ \ + register unsigned long *__lk_r = (unsigned long *)(lock); \ + register unsigned long __pl_r = pl_ldadd_acq(__lk_r, PLOCK64_WL_1) + PLOCK64_WL_1;\ + register unsigned char __m = 0; \ + while (!(__pl_r & PLOCK64_SL_ANY) && \ + (__pl_r / PLOCK64_WL_1 != (__pl_r & PLOCK64_RL_ANY) / PLOCK64_RL_1)) { \ + unsigned char __loops = __m + 1; \ + __m = (__m << 1) + 1; \ + do { \ + pl_cpu_relax(); \ + pl_cpu_relax(); \ + } while (--__loops); \ + __pl_r = pl_deref_long(__lk_r); \ + } \ + pl_barrier(); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + register unsigned int *__lk_r = (unsigned int *)(lock); \ + register unsigned int __pl_r = pl_ldadd_acq(__lk_r, PLOCK32_WL_1) + PLOCK32_WL_1;\ + register unsigned char __m = 0; \ + while (!(__pl_r & PLOCK32_SL_ANY) && \ + (__pl_r / PLOCK32_WL_1 != (__pl_r & PLOCK32_RL_ANY) / PLOCK32_RL_1)) { \ + unsigned char __loops = __m + 1; \ + __m = (__m << 1) + 1; \ + do { \ + pl_cpu_relax(); \ + pl_cpu_relax(); \ + } while (--__loops); \ + __pl_r = pl_deref_int(__lk_r); \ + } \ + pl_barrier(); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_rtoj__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_rtoj__(__FILE__,__LINE__); \ + }) \ +) + +/* Upgrade J to C. Set S. Only one thread needs to do it though it's idempotent */ +#define pl_jtoc(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long *__lk_r = (unsigned long *)(lock); \ + register unsigned long __pl_r = pl_deref_long(__lk_r); \ + if (!(__pl_r & PLOCK64_SL_ANY)) \ + pl_or_noret(__lk_r, PLOCK64_SL_1); \ + pl_barrier(); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + register unsigned int *__lk_r = (unsigned int *)(lock); \ + register unsigned int __pl_r = pl_deref_int(__lk_r); \ + if (!(__pl_r & PLOCK32_SL_ANY)) \ + pl_or_noret(__lk_r, PLOCK32_SL_1); \ + pl_barrier(); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_jtoc__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_jtoc__(__FILE__,__LINE__); \ + }) \ +) + +/* Upgrade R to C. Inc(W) then wait for R==W or S != 0 */ +#define pl_rtoc(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long *__lk_r = (unsigned long *)(lock); \ + register unsigned long __pl_r = pl_ldadd_acq(__lk_r, PLOCK64_WL_1) + PLOCK64_WL_1;\ + register unsigned char __m = 0; \ + while (__builtin_expect(!(__pl_r & PLOCK64_SL_ANY), 0)) { \ + unsigned char __loops; \ + if (__pl_r / PLOCK64_WL_1 == (__pl_r & PLOCK64_RL_ANY) / PLOCK64_RL_1) { \ + pl_or_noret(__lk_r, PLOCK64_SL_1); \ + break; \ + } \ + __loops = __m + 1; \ + __m = (__m << 1) + 1; \ + do { \ + pl_cpu_relax(); \ + pl_cpu_relax(); \ + } while (--__loops); \ + __pl_r = pl_deref_long(__lk_r); \ + } \ + pl_barrier(); \ + }) : (sizeof(*(lock)) == 4) ? 
({                                                                    \
+		register unsigned int *__lk_r = (unsigned int *)(lock);       \
+		register unsigned int __pl_r = pl_ldadd_acq(__lk_r, PLOCK32_WL_1) + PLOCK32_WL_1;\
+		register unsigned char __m = 0;                               \
+		while (__builtin_expect(!(__pl_r & PLOCK32_SL_ANY), 0)) {     \
+			unsigned char __loops;                                \
+			if (__pl_r / PLOCK32_WL_1 == (__pl_r & PLOCK32_RL_ANY) / PLOCK32_RL_1) { \
+				pl_or_noret(__lk_r, PLOCK32_SL_1);            \
+				break;                                        \
+			}                                                     \
+			__loops = __m + 1;                                    \
+			__m = (__m << 1) + 1;                                 \
+			do {                                                  \
+				pl_cpu_relax();                               \
+				pl_cpu_relax();                               \
+			} while (--__loops);                                  \
+			__pl_r = pl_deref_int(__lk_r);                        \
+		}                                                             \
+		pl_barrier();                                                 \
+	}) : ({                                                               \
+		void __unsupported_argument_size_for_pl_rtoc__(char *,int);   \
+		if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \
+			__unsupported_argument_size_for_pl_rtoc__(__FILE__,__LINE__); \
+	})                                                                    \
+)
+
+/* Drop the claim (C) lock : R--,W-- then clear S if !R */
+#define pl_drop_c(lock) (                                                     \
+	(sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({                      \
+		register unsigned long *__lk_r = (unsigned long *)(lock);     \
+		register unsigned long __set_r = - PLOCK64_RL_1 - PLOCK64_WL_1; \
+		register unsigned long __pl_r = pl_ldadd(__lk_r, __set_r) + __set_r; \
+		if (!(__pl_r & PLOCK64_RL_ANY))                               \
+			pl_and_noret(__lk_r, ~PLOCK64_SL_1);                  \
+		pl_barrier();                                                 \
+	}) : (sizeof(*(lock)) == 4) ? ({                                      \
+		register unsigned int *__lk_r = (unsigned int *)(lock);       \
+		register unsigned int __set_r = - PLOCK32_RL_1 - PLOCK32_WL_1; \
+		register unsigned int __pl_r = pl_ldadd(__lk_r, __set_r) + __set_r; \
+		if (!(__pl_r & PLOCK32_RL_ANY))                               \
+			pl_and_noret(__lk_r, ~PLOCK32_SL_1);                  \
+		pl_barrier();                                                 \
+	}) : ({                                                               \
+		void __unsupported_argument_size_for_pl_drop_c__(char *,int); \
+		if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \
+			__unsupported_argument_size_for_pl_drop_c__(__FILE__,__LINE__); \
+	})                                                                    \
+)
+
+/* Upgrade C to A. R-- then wait for !S or clear S if !R */
+#define pl_ctoa(lock) (                                                       \
+	(sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({                      \
+		register unsigned long *__lk_r = (unsigned long *)(lock);     \
+		register unsigned long __pl_r = pl_ldadd(__lk_r, -PLOCK64_RL_1) - PLOCK64_RL_1;\
+		while (__pl_r & PLOCK64_SL_ANY) {                             \
+			if (!(__pl_r & PLOCK64_RL_ANY)) {                     \
+				pl_and_noret(__lk_r, ~PLOCK64_SL_1);          \
+				break;                                        \
+			}                                                     \
+			pl_cpu_relax();                                       \
+			pl_cpu_relax();                                       \
+			__pl_r = pl_deref_long(__lk_r);                       \
+		}                                                             \
+		pl_barrier();                                                 \
+	}) : (sizeof(*(lock)) == 4) ? ({                                      \
+		register unsigned int *__lk_r = (unsigned int *)(lock);       \
+		register unsigned int __pl_r = pl_ldadd(__lk_r, -PLOCK32_RL_1) - PLOCK32_RL_1; \
+		while (__pl_r & PLOCK32_SL_ANY) {                             \
+			if (!(__pl_r & PLOCK32_RL_ANY)) {                     \
+				pl_and_noret(__lk_r, ~PLOCK32_SL_1);          \
+				break;                                        \
+			}                                                     \
+			pl_cpu_relax();                                       \
+			pl_cpu_relax();                                       \
+			__pl_r = pl_deref_int(__lk_r);                        \
+		}                                                             \
+		pl_barrier();                                                 \
+	}) : ({                                                               \
+		void __unsupported_argument_size_for_pl_ctoa__(char *,int);   \
+		if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \
+			__unsupported_argument_size_for_pl_ctoa__(__FILE__,__LINE__); \
+	})                                                                    \
+)
+
+/* downgrade the atomic write access lock (A) to join (J) */
+#define pl_atoj(lock) (                                                       \
+	(sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({                      \
+		pl_barrier();                                                 \
+		pl_add_noret(lock, PLOCK64_RL_1);                             \
+	}) : (sizeof(*(lock)) == 4) ?
({ \ + pl_barrier(); \ + pl_add_noret(lock, PLOCK32_RL_1); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_atoj__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_atoj__(__FILE__,__LINE__); \ + }) \ +) + +/* Returns non-zero if the thread calling it is the last writer, otherwise zero. It is + * designed to be called before pl_drop_j(), pl_drop_c() or pl_drop_a() for operations + * which need to be called only once. + */ +#define pl_last_writer(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + !(pl_deref_long(lock) & PLOCK64_WL_2PL); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + !(pl_deref_int(lock) & PLOCK32_WL_2PL); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_last_j__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_last_j__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* attempt to get an exclusive write access via the J lock and wait for it. + * Only one thread may succeed in this operation. It will not conflict with + * other users and will first wait for all writers to leave, then for all + * readers to leave before starting. This offers a solution to obtain an + * exclusive access to a shared resource in the R/J/C/A model. A concurrent + * take_a() will wait for this one to finish first. Using a CAS instead of XADD + * should make the operation converge slightly faster. Returns non-zero on + * success otherwise 0. + */ +#define pl_try_j(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + register unsigned long *__lk_r = (unsigned long *)(lock); \ + register unsigned long __set_r = PLOCK64_WL_1 | PLOCK64_RL_1; \ + register unsigned long __msk_r = PLOCK64_WL_ANY; \ + register unsigned long __pl_r; \ + register unsigned char __m; \ + pl_wait_unlock_long(__lk_r, __msk_r); \ + __pl_r = pl_ldadd_acq(__lk_r, __set_r) + __set_r; \ + /* wait for all other readers to leave */ \ + __m = 0; \ + while (__builtin_expect(__pl_r & PLOCK64_RL_2PL, 0)) { \ + unsigned char __loops; \ + /* give up on other writers */ \ + if (__builtin_expect(__pl_r & PLOCK64_WL_2PL, 0)) { \ + pl_sub_noret_lax(__lk_r, __set_r); \ + __pl_r = 0; /* failed to get the lock */ \ + break; \ + } \ + __loops = __m + 1; \ + __m = (__m << 1) + 1; \ + do { \ + pl_cpu_relax(); \ + pl_cpu_relax(); \ + } while (--__loops); \ + __pl_r = pl_deref_long(__lk_r); \ + } \ + pl_barrier(); \ + __pl_r; /* return value, cannot be null on success */ \ + }) : (sizeof(*(lock)) == 4) ? 
({ \ + register unsigned int *__lk_r = (unsigned int *)(lock); \ + register unsigned int __set_r = PLOCK32_WL_1 | PLOCK32_RL_1; \ + register unsigned int __msk_r = PLOCK32_WL_ANY; \ + register unsigned int __pl_r; \ + register unsigned char __m; \ + pl_wait_unlock_int(__lk_r, __msk_r); \ + __pl_r = pl_ldadd_acq(__lk_r, __set_r) + __set_r; \ + /* wait for all other readers to leave */ \ + __m = 0; \ + while (__builtin_expect(__pl_r & PLOCK32_RL_2PL, 0)) { \ + unsigned char __loops; \ + /* but rollback on other writers */ \ + if (__builtin_expect(__pl_r & PLOCK32_WL_2PL, 0)) { \ + pl_sub_noret_lax(__lk_r, __set_r); \ + __pl_r = 0; /* failed to get the lock */ \ + break; \ + } \ + __loops = __m + 1; \ + __m = (__m << 1) + 1; \ + do { \ + pl_cpu_relax(); \ + pl_cpu_relax(); \ + } while (--__loops); \ + __pl_r = pl_deref_int(__lk_r); \ + } \ + pl_barrier(); \ + __pl_r; /* return value, cannot be null on success */ \ + }) : ({ \ + void __unsupported_argument_size_for_pl_try_j__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_try_j__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* request an exclusive write access via the J lock and wait for it. Only one + * thread may succeed in this operation. It will not conflict with other users + * and will first wait for all writers to leave, then for all readers to leave + * before starting. This offers a solution to obtain an exclusive access to a + * shared resource in the R/J/C/A model. A concurrent take_a() will wait for + * this one to finish first. Using a CAS instead of XADD should make the + * operation converge slightly faster. + */ +#define pl_take_j(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + __label__ __retry; \ + register unsigned long *__lk_r = (unsigned long *)(lock); \ + register unsigned long __set_r = PLOCK64_WL_1 | PLOCK64_RL_1; \ + register unsigned long __msk_r = PLOCK64_WL_ANY; \ + register unsigned long __pl_r; \ + register unsigned char __m; \ + __retry: \ + pl_wait_unlock_long(__lk_r, __msk_r); \ + __pl_r = pl_ldadd_acq(__lk_r, __set_r) + __set_r; \ + /* wait for all other readers to leave */ \ + __m = 0; \ + while (__builtin_expect(__pl_r & PLOCK64_RL_2PL, 0)) { \ + unsigned char __loops; \ + /* but rollback on other writers */ \ + if (__builtin_expect(__pl_r & PLOCK64_WL_2PL, 0)) { \ + pl_sub_noret_lax(__lk_r, __set_r); \ + goto __retry; \ + } \ + __loops = __m + 1; \ + __m = (__m << 1) + 1; \ + do { \ + pl_cpu_relax(); \ + pl_cpu_relax(); \ + } while (--__loops); \ + __pl_r = pl_deref_long(__lk_r); \ + } \ + pl_barrier(); \ + 0; \ + }) : (sizeof(*(lock)) == 4) ? 
({ \ + __label__ __retry; \ + register unsigned int *__lk_r = (unsigned int *)(lock); \ + register unsigned int __set_r = PLOCK32_WL_1 | PLOCK32_RL_1; \ + register unsigned int __msk_r = PLOCK32_WL_ANY; \ + register unsigned int __pl_r; \ + register unsigned char __m; \ + __retry: \ + pl_wait_unlock_int(__lk_r, __msk_r); \ + __pl_r = pl_ldadd_acq(__lk_r, __set_r) + __set_r; \ + /* wait for all other readers to leave */ \ + __m = 0; \ + while (__builtin_expect(__pl_r & PLOCK32_RL_2PL, 0)) { \ + unsigned char __loops; \ + /* but rollback on other writers */ \ + if (__builtin_expect(__pl_r & PLOCK32_WL_2PL, 0)) { \ + pl_sub_noret_lax(__lk_r, __set_r); \ + goto __retry; \ + } \ + __loops = __m + 1; \ + __m = (__m << 1) + 1; \ + do { \ + pl_cpu_relax(); \ + pl_cpu_relax(); \ + } while (--__loops); \ + __pl_r = pl_deref_int(__lk_r); \ + } \ + pl_barrier(); \ + 0; \ + }) : ({ \ + void __unsupported_argument_size_for_pl_take_j__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_take_j__(__FILE__,__LINE__); \ + 0; \ + }) \ +) + +/* drop the join (J) lock entirely */ +#define pl_drop_j(lock) ( \ + (sizeof(long) == 8 && sizeof(*(lock)) == 8) ? ({ \ + pl_barrier(); \ + pl_sub_noret_rel(lock, PLOCK64_WL_1 | PLOCK64_RL_1); \ + }) : (sizeof(*(lock)) == 4) ? ({ \ + pl_barrier(); \ + pl_sub_noret_rel(lock, PLOCK32_WL_1 | PLOCK32_RL_1); \ + }) : ({ \ + void __unsupported_argument_size_for_pl_drop_j__(char *,int); \ + if (sizeof(*(lock)) != 4 && (sizeof(long) != 8 || sizeof(*(lock)) != 8)) \ + __unsupported_argument_size_for_pl_drop_j__(__FILE__,__LINE__); \ + }) \ +) + +/* + * The part below is for Low Overhead R/W locks (LORW). These ones are not + * upgradable and not necessarily fair but they try to be fast when uncontended + * and to limit the cost and perturbation during contention. Writers always + * have precedence over readers to preserve latency as much as possible. + * + * The principle is to offer a fast no-contention path and a limited total + * number of writes for the contended path. Since R/W locks are expected to be + * used in situations where there is a benefit in separating reads from writes, + * it is expected that reads are common (typ >= 50%) and that there is often at + * least one reader (otherwise a spinlock wouldn't be a problem). As such, a + * reader will try to pass instantly, detect contention and immediately retract + * and wait in the queue in case there is contention. A writer will first also + * try to pass instantly, and if it fails due to pending readers, it will mark + * that it's waiting so that readers stop entering. This will leave the writer + * waiting as close as possible to the point of being granted access. New + * writers will also notice this previous contention and will wait outside. + * This means that a successful access for a reader or a writer requires a + * single CAS, and a contended attempt will require one failed CAS and one + * successful XADD for a reader, or an optional OR and a N+1 CAS for the + * writer. + * + * A counter of shared users indicates the number of active readers, while a + * (single-bit) counter of exclusive writers indicates whether the lock is + * currently held for writes. This distinction also permits to use a single + * function to release the lock if desired, since the exclusive bit indicates + * the state of the caller of unlock(). The WRQ bit is cleared during the + * unlock. 
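As a usage illustration of the exclusive J path defined above (the LORW description resumes just below), here is a minimal sketch. It assumes this header is available as "plock.h"; the lock word and function names are illustrative only, not part of this header.

#include "plock.h"

static unsigned long job_lock;          /* zero-initialized plock word */

void run_exclusive_job(void)
{
	pl_take_j(&job_lock);           /* waits for writers, then readers */
	/* ... exactly one thread executes here at a time ... */
	pl_drop_j(&job_lock);           /* drops W and R in one operation */
}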
+ * + * Layout: (32/64 bit): + * 31 2 1 0 + * +-----------+--------------+-----+-----+ + * | | SHR | WRQ | EXC | + * +-----------+--------------+-----+-----+ + * + * In order to minimize operations, the WRQ bit is held during EXC so that the + * write waiter that had to fight for EXC doesn't have to release WRQ during + * its operations, and will just drop it along with EXC upon unlock. + * + * This means the following costs: + * reader: + * success: 1 CAS + * failure: 1 CAS + 1 XADD + * unlock: 1 SUB + * writer: + * success: 1 RD + 1 CAS + * failure: 1 RD + 1 CAS + 0/1 OR + N CAS + * unlock: 1 AND + */ + +#define PLOCK_LORW_EXC_BIT ((sizeof(long) == 8) ? 0 : 0) +#define PLOCK_LORW_EXC_SIZE ((sizeof(long) == 8) ? 1 : 1) +#define PLOCK_LORW_EXC_BASE (1UL << PLOCK_LORW_EXC_BIT) +#define PLOCK_LORW_EXC_MASK (((1UL << PLOCK_LORW_EXC_SIZE) - 1UL) << PLOCK_LORW_EXC_BIT) + +#define PLOCK_LORW_WRQ_BIT ((sizeof(long) == 8) ? 1 : 1) +#define PLOCK_LORW_WRQ_SIZE ((sizeof(long) == 8) ? 1 : 1) +#define PLOCK_LORW_WRQ_BASE (1UL << PLOCK_LORW_WRQ_BIT) +#define PLOCK_LORW_WRQ_MASK (((1UL << PLOCK_LORW_WRQ_SIZE) - 1UL) << PLOCK_LORW_WRQ_BIT) + +#define PLOCK_LORW_SHR_BIT ((sizeof(long) == 8) ? 2 : 2) +#define PLOCK_LORW_SHR_SIZE ((sizeof(long) == 8) ? 30 : 30) +#define PLOCK_LORW_SHR_BASE (1UL << PLOCK_LORW_SHR_BIT) +#define PLOCK_LORW_SHR_MASK (((1UL << PLOCK_LORW_SHR_SIZE) - 1UL) << PLOCK_LORW_SHR_BIT) + +__attribute__((unused,always_inline,no_instrument_function)) +static inline void pl_lorw_rdlock(unsigned long *lock) +{ + unsigned long lk = 0; + + /* First, assume we're alone and try to get the read lock (fast path). + * It often works because read locks are often used on low-contention + * structs. + */ + lk = pl_cmpxchg(lock, 0, PLOCK_LORW_SHR_BASE); + if (!lk) + return; + + /* so we were not alone, make sure there's no writer waiting for the + * lock to be empty of visitors. + */ + if (lk & PLOCK_LORW_WRQ_MASK) +#if defined(PLOCK_LORW_INLINE_WAIT) && !defined(PLOCK_DISABLE_EBO) + lk = __pl_wait_unlock_long(lock, PLOCK_LORW_WRQ_MASK); +#else + lk = pl_wait_unlock_long(lock, PLOCK_LORW_WRQ_MASK); +#endif + + /* count us as visitor among others */ + lk = pl_ldadd_acq(lock, PLOCK_LORW_SHR_BASE); + + /* wait for end of exclusive access if any */ + if (lk & PLOCK_LORW_EXC_MASK) +#if defined(PLOCK_LORW_INLINE_WAIT) && !defined(PLOCK_DISABLE_EBO) + lk = __pl_wait_unlock_long(lock, PLOCK_LORW_EXC_MASK); +#else + lk = pl_wait_unlock_long(lock, PLOCK_LORW_EXC_MASK); +#endif +} + + +__attribute__((unused,always_inline,no_instrument_function)) +static inline void pl_lorw_wrlock(unsigned long *lock) +{ + unsigned long lk = 0; + unsigned long old = 0; + + /* first, make sure another writer is not already blocked waiting for + * readers to leave. Note that tests have shown that it can be even + * faster to avoid the first check and to unconditionally wait. + */ + lk = pl_deref_long(lock); + if (__builtin_expect(lk & PLOCK_LORW_WRQ_MASK, 1)) +#if defined(PLOCK_LORW_INLINE_WAIT) && !defined(PLOCK_DISABLE_EBO) + lk = __pl_wait_unlock_long(lock, PLOCK_LORW_WRQ_MASK); +#else + lk = pl_wait_unlock_long(lock, PLOCK_LORW_WRQ_MASK); +#endif + + do { + /* let's check for the two sources of contention at once */ + + if (__builtin_expect(lk & (PLOCK_LORW_SHR_MASK | PLOCK_LORW_EXC_MASK), 1)) { + /* check if there are still readers coming. If so, close the door and + * wait for them to leave. 
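+			 * The OR below closes the door only when WRQ is not
+			 * already set, which spares an atomic read-modify-write
+			 * when another writer has already done it; the thread
+			 * then waits for SHR to drain back to zero before
+			 * retrying the CAS that finally sets EXC.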
+ */ + if (lk & PLOCK_LORW_SHR_MASK) { + /* note below, an OR is significantly cheaper than BTS or XADD */ + if (!(lk & PLOCK_LORW_WRQ_MASK)) + pl_or_noret(lock, PLOCK_LORW_WRQ_BASE); +#if defined(PLOCK_LORW_INLINE_WAIT) && !defined(PLOCK_DISABLE_EBO) + lk = __pl_wait_unlock_long(lock, PLOCK_LORW_SHR_MASK); +#else + lk = pl_wait_unlock_long(lock, PLOCK_LORW_SHR_MASK); +#endif + } + + /* And also wait for a previous writer to finish. */ + if (lk & PLOCK_LORW_EXC_MASK) +#if defined(PLOCK_LORW_INLINE_WAIT) && !defined(PLOCK_DISABLE_EBO) + lk = __pl_wait_unlock_long(lock, PLOCK_LORW_EXC_MASK); +#else + lk = pl_wait_unlock_long(lock, PLOCK_LORW_EXC_MASK); +#endif + } + + /* A fresh new reader may appear right now if there were none + * above and we didn't close the door. + */ + old = lk & ~PLOCK_LORW_SHR_MASK & ~PLOCK_LORW_EXC_MASK; + lk = pl_cmpxchg(lock, old, old | PLOCK_LORW_EXC_BASE); + } while (lk != old); + + /* done, not waiting anymore, the WRQ bit if any, will be dropped by the + * unlock + */ +} + + +__attribute__((unused,always_inline,no_instrument_function)) +static inline void pl_lorw_rdunlock(unsigned long *lock) +{ + pl_sub_noret_rel(lock, PLOCK_LORW_SHR_BASE); +} + +__attribute__((unused,always_inline,no_instrument_function)) +static inline void pl_lorw_wrunlock(unsigned long *lock) +{ + pl_and_noret_rel(lock, ~(PLOCK_LORW_WRQ_MASK | PLOCK_LORW_EXC_MASK)); +} + +__attribute__((unused,always_inline,no_instrument_function)) +static inline void pl_lorw_unlock(unsigned long *lock) +{ + if (pl_deref_long(lock) & PLOCK_LORW_EXC_MASK) + pl_lorw_wrunlock(lock); + else + pl_lorw_rdunlock(lock); +} + +#endif /* PL_PLOCK_H */ diff --git a/include/import/sha1.h b/include/import/sha1.h new file mode 100644 index 0000000..33ee530 --- /dev/null +++ b/include/import/sha1.h @@ -0,0 +1,35 @@ +/* + * Based on the git SHA1 Implementation. + * + * Copyright (C) 2009-2015, Linus Torvalds and others. + * + * SHA1 routine optimized to do word accesses rather than byte accesses, + * and to avoid unnecessary copies into the context array. + * + * This was initially based on the Mozilla SHA1 implementation, although + * none of the original Mozilla code remains. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation, version 2.1 + * exclusively. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +typedef struct { + unsigned long long size; + unsigned int H[5]; + unsigned int W[16]; +} blk_SHA_CTX; + +void blk_SHA1_Init(blk_SHA_CTX *ctx); +void blk_SHA1_Update(blk_SHA_CTX *ctx, const void *dataIn, unsigned long len); +void blk_SHA1_Final(unsigned char hashout[20], blk_SHA_CTX *ctx); diff --git a/include/import/slz-tables.h b/include/import/slz-tables.h new file mode 100644 index 0000000..0b3a5b9 --- /dev/null +++ b/include/import/slz-tables.h @@ -0,0 +1,257 @@ +/* Fixed Huffman table as per RFC1951. 
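Before the slz tables that follow, a minimal sketch of the blk_SHA1 API added just above. It assumes the matching sha1.c implementation is linked in; "abc" must produce the well-known SHA-1 test vector a9993e364706816aba3e25717850c26c9cd0d89d.

#include <stdio.h>
#include "sha1.h"

int main(void)
{
	static const char msg[] = "abc";
	unsigned char digest[20];
	blk_SHA_CTX ctx;
	int i;

	blk_SHA1_Init(&ctx);
	blk_SHA1_Update(&ctx, msg, sizeof(msg) - 1);   /* hash the 3 bytes */
	blk_SHA1_Final(digest, &ctx);

	for (i = 0; i < 20; i++)
		printf("%02x", digest[i]);
	putchar('\n');
	return 0;
}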
+ * + * Lit Value Bits Codes + * --------- ---- ----- + * 0 - 143 8 00110000 through 10111111 + * 144 - 255 9 110010000 through 111111111 + * 256 - 279 7 0000000 through 0010111 + * 280 - 287 8 11000000 through 11000111 + * + * The codes are encoded in reverse, the high bit of the code appears encoded + * as bit 0. The table is built by mkhuff.sh. The 16 bits are encoded this way : + * - bits 0..3 : bits + * - bits 4..12 : code + */ +static const uint16_t fixed_huff[288] = { + 0x00c8, 0x08c8, 0x04c8, 0x0cc8, 0x02c8, 0x0ac8, 0x06c8, 0x0ec8, // 0 + 0x01c8, 0x09c8, 0x05c8, 0x0dc8, 0x03c8, 0x0bc8, 0x07c8, 0x0fc8, // 8 + 0x0028, 0x0828, 0x0428, 0x0c28, 0x0228, 0x0a28, 0x0628, 0x0e28, // 16 + 0x0128, 0x0928, 0x0528, 0x0d28, 0x0328, 0x0b28, 0x0728, 0x0f28, // 24 + 0x00a8, 0x08a8, 0x04a8, 0x0ca8, 0x02a8, 0x0aa8, 0x06a8, 0x0ea8, // 32 + 0x01a8, 0x09a8, 0x05a8, 0x0da8, 0x03a8, 0x0ba8, 0x07a8, 0x0fa8, // 40 + 0x0068, 0x0868, 0x0468, 0x0c68, 0x0268, 0x0a68, 0x0668, 0x0e68, // 48 + 0x0168, 0x0968, 0x0568, 0x0d68, 0x0368, 0x0b68, 0x0768, 0x0f68, // 56 + 0x00e8, 0x08e8, 0x04e8, 0x0ce8, 0x02e8, 0x0ae8, 0x06e8, 0x0ee8, // 64 + 0x01e8, 0x09e8, 0x05e8, 0x0de8, 0x03e8, 0x0be8, 0x07e8, 0x0fe8, // 72 + 0x0018, 0x0818, 0x0418, 0x0c18, 0x0218, 0x0a18, 0x0618, 0x0e18, // 80 + 0x0118, 0x0918, 0x0518, 0x0d18, 0x0318, 0x0b18, 0x0718, 0x0f18, // 88 + 0x0098, 0x0898, 0x0498, 0x0c98, 0x0298, 0x0a98, 0x0698, 0x0e98, // 96 + 0x0198, 0x0998, 0x0598, 0x0d98, 0x0398, 0x0b98, 0x0798, 0x0f98, // 104 + 0x0058, 0x0858, 0x0458, 0x0c58, 0x0258, 0x0a58, 0x0658, 0x0e58, // 112 + 0x0158, 0x0958, 0x0558, 0x0d58, 0x0358, 0x0b58, 0x0758, 0x0f58, // 120 + 0x00d8, 0x08d8, 0x04d8, 0x0cd8, 0x02d8, 0x0ad8, 0x06d8, 0x0ed8, // 128 + 0x01d8, 0x09d8, 0x05d8, 0x0dd8, 0x03d8, 0x0bd8, 0x07d8, 0x0fd8, // 136 + 0x0139, 0x1139, 0x0939, 0x1939, 0x0539, 0x1539, 0x0d39, 0x1d39, // 144 + 0x0339, 0x1339, 0x0b39, 0x1b39, 0x0739, 0x1739, 0x0f39, 0x1f39, // 152 + 0x00b9, 0x10b9, 0x08b9, 0x18b9, 0x04b9, 0x14b9, 0x0cb9, 0x1cb9, // 160 + 0x02b9, 0x12b9, 0x0ab9, 0x1ab9, 0x06b9, 0x16b9, 0x0eb9, 0x1eb9, // 168 + 0x01b9, 0x11b9, 0x09b9, 0x19b9, 0x05b9, 0x15b9, 0x0db9, 0x1db9, // 176 + 0x03b9, 0x13b9, 0x0bb9, 0x1bb9, 0x07b9, 0x17b9, 0x0fb9, 0x1fb9, // 184 + 0x0079, 0x1079, 0x0879, 0x1879, 0x0479, 0x1479, 0x0c79, 0x1c79, // 192 + 0x0279, 0x1279, 0x0a79, 0x1a79, 0x0679, 0x1679, 0x0e79, 0x1e79, // 200 + 0x0179, 0x1179, 0x0979, 0x1979, 0x0579, 0x1579, 0x0d79, 0x1d79, // 208 + 0x0379, 0x1379, 0x0b79, 0x1b79, 0x0779, 0x1779, 0x0f79, 0x1f79, // 216 + 0x00f9, 0x10f9, 0x08f9, 0x18f9, 0x04f9, 0x14f9, 0x0cf9, 0x1cf9, // 224 + 0x02f9, 0x12f9, 0x0af9, 0x1af9, 0x06f9, 0x16f9, 0x0ef9, 0x1ef9, // 232 + 0x01f9, 0x11f9, 0x09f9, 0x19f9, 0x05f9, 0x15f9, 0x0df9, 0x1df9, // 240 + 0x03f9, 0x13f9, 0x0bf9, 0x1bf9, 0x07f9, 0x17f9, 0x0ff9, 0x1ff9, // 248 + 0x0007, 0x0407, 0x0207, 0x0607, 0x0107, 0x0507, 0x0307, 0x0707, // 256 + 0x0087, 0x0487, 0x0287, 0x0687, 0x0187, 0x0587, 0x0387, 0x0787, // 264 + 0x0047, 0x0447, 0x0247, 0x0647, 0x0147, 0x0547, 0x0347, 0x0747, // 272 + 0x0038, 0x0838, 0x0438, 0x0c38, 0x0238, 0x0a38, 0x0638, 0x0e38 // 280 +}; + +/* length from 3 to 258 converted to bit strings for use with fixed huffman + * coding. It was built by tools/dump_len.c. 
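To make the fixed_huff[] entry layout above concrete, here is a small self-contained check; the value 0x00c8 is copied from the table's first entry, the 8-bit code 00110000 for literal 0.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t entry = 0x00c8;        /* fixed_huff[0], literal 0 */
	unsigned nbits = entry & 0xF;   /* bits 0..3: length = 8 */
	unsigned code  = entry >> 4;    /* bits 4..12: 0x00c = 00110000 reversed */

	printf("literal 0: %u bits, reversed code 0x%03x\n", nbits, code);
	return 0;
}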
The format is the following : + * - bits 0..15 = code + * - bits 16..19 = #bits + */ +static const uint32_t len_fh[259] = { + 0x000000, 0x000000, 0x000000, 0x070040, /* 0-3 */ + 0x070020, 0x070060, 0x070010, 0x070050, /* 4-7 */ + 0x070030, 0x070070, 0x070008, 0x080048, /* 8-11 */ + 0x0800c8, 0x080028, 0x0800a8, 0x080068, /* 12-15 */ + 0x0800e8, 0x080018, 0x080098, 0x090058, /* 16-19 */ + 0x0900d8, 0x090158, 0x0901d8, 0x090038, /* 20-23 */ + 0x0900b8, 0x090138, 0x0901b8, 0x090078, /* 24-27 */ + 0x0900f8, 0x090178, 0x0901f8, 0x090004, /* 28-31 */ + 0x090084, 0x090104, 0x090184, 0x0a0044, /* 32-35 */ + 0x0a00c4, 0x0a0144, 0x0a01c4, 0x0a0244, /* 36-39 */ + 0x0a02c4, 0x0a0344, 0x0a03c4, 0x0a0024, /* 40-43 */ + 0x0a00a4, 0x0a0124, 0x0a01a4, 0x0a0224, /* 44-47 */ + 0x0a02a4, 0x0a0324, 0x0a03a4, 0x0a0064, /* 48-51 */ + 0x0a00e4, 0x0a0164, 0x0a01e4, 0x0a0264, /* 52-55 */ + 0x0a02e4, 0x0a0364, 0x0a03e4, 0x0a0014, /* 56-59 */ + 0x0a0094, 0x0a0114, 0x0a0194, 0x0a0214, /* 60-63 */ + 0x0a0294, 0x0a0314, 0x0a0394, 0x0b0054, /* 64-67 */ + 0x0b00d4, 0x0b0154, 0x0b01d4, 0x0b0254, /* 68-71 */ + 0x0b02d4, 0x0b0354, 0x0b03d4, 0x0b0454, /* 72-75 */ + 0x0b04d4, 0x0b0554, 0x0b05d4, 0x0b0654, /* 76-79 */ + 0x0b06d4, 0x0b0754, 0x0b07d4, 0x0b0034, /* 80-83 */ + 0x0b00b4, 0x0b0134, 0x0b01b4, 0x0b0234, /* 84-87 */ + 0x0b02b4, 0x0b0334, 0x0b03b4, 0x0b0434, /* 88-91 */ + 0x0b04b4, 0x0b0534, 0x0b05b4, 0x0b0634, /* 92-95 */ + 0x0b06b4, 0x0b0734, 0x0b07b4, 0x0b0074, /* 96-99 */ + 0x0b00f4, 0x0b0174, 0x0b01f4, 0x0b0274, /* 100-103 */ + 0x0b02f4, 0x0b0374, 0x0b03f4, 0x0b0474, /* 104-107 */ + 0x0b04f4, 0x0b0574, 0x0b05f4, 0x0b0674, /* 108-111 */ + 0x0b06f4, 0x0b0774, 0x0b07f4, 0x0c0003, /* 112-115 */ + 0x0c0103, 0x0c0203, 0x0c0303, 0x0c0403, /* 116-119 */ + 0x0c0503, 0x0c0603, 0x0c0703, 0x0c0803, /* 120-123 */ + 0x0c0903, 0x0c0a03, 0x0c0b03, 0x0c0c03, /* 124-127 */ + 0x0c0d03, 0x0c0e03, 0x0c0f03, 0x0d0083, /* 128-131 */ + 0x0d0183, 0x0d0283, 0x0d0383, 0x0d0483, /* 132-135 */ + 0x0d0583, 0x0d0683, 0x0d0783, 0x0d0883, /* 136-139 */ + 0x0d0983, 0x0d0a83, 0x0d0b83, 0x0d0c83, /* 140-143 */ + 0x0d0d83, 0x0d0e83, 0x0d0f83, 0x0d1083, /* 144-147 */ + 0x0d1183, 0x0d1283, 0x0d1383, 0x0d1483, /* 148-151 */ + 0x0d1583, 0x0d1683, 0x0d1783, 0x0d1883, /* 152-155 */ + 0x0d1983, 0x0d1a83, 0x0d1b83, 0x0d1c83, /* 156-159 */ + 0x0d1d83, 0x0d1e83, 0x0d1f83, 0x0d0043, /* 160-163 */ + 0x0d0143, 0x0d0243, 0x0d0343, 0x0d0443, /* 164-167 */ + 0x0d0543, 0x0d0643, 0x0d0743, 0x0d0843, /* 168-171 */ + 0x0d0943, 0x0d0a43, 0x0d0b43, 0x0d0c43, /* 172-175 */ + 0x0d0d43, 0x0d0e43, 0x0d0f43, 0x0d1043, /* 176-179 */ + 0x0d1143, 0x0d1243, 0x0d1343, 0x0d1443, /* 180-183 */ + 0x0d1543, 0x0d1643, 0x0d1743, 0x0d1843, /* 184-187 */ + 0x0d1943, 0x0d1a43, 0x0d1b43, 0x0d1c43, /* 188-191 */ + 0x0d1d43, 0x0d1e43, 0x0d1f43, 0x0d00c3, /* 192-195 */ + 0x0d01c3, 0x0d02c3, 0x0d03c3, 0x0d04c3, /* 196-199 */ + 0x0d05c3, 0x0d06c3, 0x0d07c3, 0x0d08c3, /* 200-203 */ + 0x0d09c3, 0x0d0ac3, 0x0d0bc3, 0x0d0cc3, /* 204-207 */ + 0x0d0dc3, 0x0d0ec3, 0x0d0fc3, 0x0d10c3, /* 208-211 */ + 0x0d11c3, 0x0d12c3, 0x0d13c3, 0x0d14c3, /* 212-215 */ + 0x0d15c3, 0x0d16c3, 0x0d17c3, 0x0d18c3, /* 216-219 */ + 0x0d19c3, 0x0d1ac3, 0x0d1bc3, 0x0d1cc3, /* 220-223 */ + 0x0d1dc3, 0x0d1ec3, 0x0d1fc3, 0x0d0023, /* 224-227 */ + 0x0d0123, 0x0d0223, 0x0d0323, 0x0d0423, /* 228-231 */ + 0x0d0523, 0x0d0623, 0x0d0723, 0x0d0823, /* 232-235 */ + 0x0d0923, 0x0d0a23, 0x0d0b23, 0x0d0c23, /* 236-239 */ + 0x0d0d23, 0x0d0e23, 0x0d0f23, 0x0d1023, /* 240-243 */ + 0x0d1123, 0x0d1223, 0x0d1323, 0x0d1423, /* 244-247 */ + 0x0d1523, 
0x0d1623, 0x0d1723, 0x0d1823, /* 248-251 */ + 0x0d1923, 0x0d1a23, 0x0d1b23, 0x0d1c23, /* 252-255 */ + 0x0d1d23, 0x0d1e23, 0x0800a3 /* 256-258 */ +}; + +/* This horrible mess is needed to shut up the fallthrough warning since the + * stupid comment approach doesn't resist to separate preprocessing (e.g. as + * used in distcc). Note that compilers which support the fallthrough attribute + * also support __has_attribute. + */ +#ifndef __fallthrough +# ifdef __has_attribute +# if __has_attribute(fallthrough) +# define __fallthrough __attribute__((fallthrough)) +# else +# define __fallthrough do { } while (0) +# endif +# else +# define __fallthrough do { } while (0) +# endif +#endif + +#if !defined(__ARM_FEATURE_CRC32) +static uint32_t crc32_fast[4][256]; +#endif + +static uint32_t fh_dist_table[32768]; + +#if !defined(__ARM_FEATURE_CRC32) +/* Make the table for a fast CRC. + * Not thread-safe, must be called exactly once. + */ +static inline void __slz_make_crc_table(void) +{ + uint32_t c; + int n, k; + + for (n = 0; n < 256; n++) { + c = (uint32_t) n ^ 255; + for (k = 0; k < 8; k++) { + if (c & 1) { + c = 0xedb88320 ^ (c >> 1); + } else { + c = c >> 1; + } + } + crc32_fast[0][n] = c ^ 0xff000000; + } + + /* Note: here we *do not* have to invert the bits corresponding to the + * byte position, because [0] already has the 8 highest bits inverted, + * and these bits are shifted by 8 at the end of the operation, which + * results in having the next 8 bits shifted in turn. That's why we + * have the xor in the index used just after a computation. + */ + for (n = 0; n < 256; n++) { + crc32_fast[1][n] = 0xff000000 ^ crc32_fast[0][(0xff000000 ^ crc32_fast[0][n] ^ 0xff) & 0xff] ^ (crc32_fast[0][n] >> 8); + crc32_fast[2][n] = 0xff000000 ^ crc32_fast[0][(0x00ff0000 ^ crc32_fast[1][n] ^ 0xff) & 0xff] ^ (crc32_fast[1][n] >> 8); + crc32_fast[3][n] = 0xff000000 ^ crc32_fast[0][(0x0000ff00 ^ crc32_fast[2][n] ^ 0xff) & 0xff] ^ (crc32_fast[2][n] >> 8); + } +} +#endif + +/* Returns code for lengths 1 to 32768. The bit size for the next value can be + * found this way : + * + * bits = code >> 1; + * if (bits) + * bits--; + * + */ +static inline uint32_t dist_to_code(uint32_t l) +{ + uint32_t code; + + code = 0; + switch (l) { + case 24577 ... 32768: code++; __fallthrough; + case 16385 ... 24576: code++; __fallthrough; + case 12289 ... 16384: code++; __fallthrough; + case 8193 ... 12288: code++; __fallthrough; + case 6145 ... 8192: code++; __fallthrough; + case 4097 ... 6144: code++; __fallthrough; + case 3073 ... 4096: code++; __fallthrough; + case 2049 ... 3072: code++; __fallthrough; + case 1537 ... 2048: code++; __fallthrough; + case 1025 ... 1536: code++; __fallthrough; + case 769 ... 1024: code++; __fallthrough; + case 513 ... 768: code++; __fallthrough; + case 385 ... 512: code++; __fallthrough; + case 257 ... 384: code++; __fallthrough; + case 193 ... 256: code++; __fallthrough; + case 129 ... 192: code++; __fallthrough; + case 97 ... 128: code++; __fallthrough; + case 65 ... 96: code++; __fallthrough; + case 49 ... 64: code++; __fallthrough; + case 33 ... 48: code++; __fallthrough; + case 25 ... 32: code++; __fallthrough; + case 17 ... 24: code++; __fallthrough; + case 13 ... 16: code++; __fallthrough; + case 9 ... 12: code++; __fallthrough; + case 7 ... 8: code++; __fallthrough; + case 5 ... 
6: code++; __fallthrough; + case 4 : code++; __fallthrough; + case 3 : code++; __fallthrough; + case 2 : code++; + } + + return code; +} + +/* not thread-safe, must be called exactly once */ +static inline void __slz_prepare_dist_table() +{ + uint32_t dist; + uint32_t code; + uint32_t bits; + + for (dist = 0; dist < sizeof(fh_dist_table) / sizeof(*fh_dist_table); dist++) { + code = dist_to_code(dist + 1); + bits = code >> 1; + if (bits) + bits--; + + /* Distance codes are stored on 5 bits reversed. The RFC + * doesn't state that they are reversed, but it's the only + * way it works. + */ + code = ((code & 0x01) << 4) | ((code & 0x02) << 2) | + (code & 0x04) | + ((code & 0x08) >> 2) | ((code & 0x10) >> 4); + + code += (dist & ((1 << bits) - 1)) << 5; + fh_dist_table[dist] = (code << 5) + bits + 5; + } +} diff --git a/include/import/slz.h b/include/import/slz.h new file mode 100644 index 0000000..901a790 --- /dev/null +++ b/include/import/slz.h @@ -0,0 +1,200 @@ +/* + * Copyright (C) 2013-2015 Willy Tarreau <w@1wt.eu> + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _SLZ_H +#define _SLZ_H + +#include <inttypes.h> + +/* We have two macros UNALIGNED_LE_OK and UNALIGNED_FASTER. The latter indicates + * that using unaligned data is faster than a simple shift. On x86 32-bit at + * least it is not the case as the per-byte access is 30% faster. A core2-duo on + * x86_64 is 7% faster to read one byte + shifting by 8 than to read one word, + * but a core i5 is 7% faster doing the unaligned read, so we privilege more + * recent implementations here. + */ +#if defined(__x86_64__) +#define UNALIGNED_LE_OK +#define UNALIGNED_FASTER +#define USE_64BIT_QUEUE +#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) +#define UNALIGNED_LE_OK +//#define UNALIGNED_FASTER +#elif defined(__ARMEL__) && defined(__ARM_ARCH_7A__) +#define UNALIGNED_LE_OK +#define UNALIGNED_FASTER +#elif defined(__ARM_ARCH_8A) || defined(__ARM_FEATURE_UNALIGNED) +#define UNALIGNED_LE_OK +#define UNALIGNED_FASTER +#endif + +/* Log2 of the size of the hash table used for the references table. 
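A few hedged spot-checks of dist_to_code() from slz-tables.h above, against RFC1951's 30 distance codes (0..29); this assumes the function and its __fallthrough helper are in scope, e.g. by pasting this function below them in the same unit.

#include <assert.h>

int main(void)
{
	assert(dist_to_code(1) == 0);       /* distance 1 -> code 0 */
	assert(dist_to_code(4) == 3);       /* distance 4 -> code 3 */
	assert(dist_to_code(24577) == 29);  /* last bucket starts here... */
	assert(dist_to_code(32768) == 29);  /* ...and ends at 32768 */
	return 0;
}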
*/ +#define HASH_BITS 13 + +enum slz_state { + SLZ_ST_INIT, /* stream initialized */ + SLZ_ST_EOB, /* header or end of block already sent */ + SLZ_ST_FIXED, /* inside a fixed huffman sequence */ + SLZ_ST_LAST, /* last block, BFINAL sent */ + SLZ_ST_DONE, /* BFINAL+EOB sent BFINAL */ + SLZ_ST_END /* end sent (BFINAL, EOB, CRC + len) */ +}; + +enum { + SLZ_FMT_GZIP, /* RFC1952: gzip envelope and crc32 for CRC */ + SLZ_FMT_ZLIB, /* RFC1950: zlib envelope and adler-32 for CRC */ + SLZ_FMT_DEFLATE, /* RFC1951: raw deflate, and no crc */ +}; + +struct slz_stream { +#ifdef USE_64BIT_QUEUE + uint64_t queue; /* last pending bits, LSB first */ +#else + uint32_t queue; /* last pending bits, LSB first */ +#endif + uint32_t qbits; /* number of bits in queue, < 8 on 32-bit, < 32 on 64-bit */ + unsigned char *outbuf; /* set by encode() */ + uint16_t state; /* one of slz_state */ + uint8_t level:1; /* 0 = no compression, 1 = compression */ + uint8_t format:2; /* SLZ_FMT_* */ + uint8_t unused1; /* unused for now */ + uint32_t crc32; + uint32_t ilen; +}; + +/* Functions specific to rfc1951 (deflate) */ +long slz_rfc1951_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more); +int slz_rfc1951_init(struct slz_stream *strm, int level); +int slz_rfc1951_flush(struct slz_stream *strm, unsigned char *buf); +int slz_rfc1951_finish(struct slz_stream *strm, unsigned char *buf); + +/* Functions specific to rfc1952 (gzip) */ +uint32_t slz_crc32_by1(uint32_t crc, const unsigned char *buf, int len); +uint32_t slz_crc32_by4(uint32_t crc, const unsigned char *buf, int len); +long slz_rfc1952_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more); +int slz_rfc1952_send_header(struct slz_stream *strm, unsigned char *buf); +int slz_rfc1952_init(struct slz_stream *strm, int level); +int slz_rfc1952_flush(struct slz_stream *strm, unsigned char *buf); +int slz_rfc1952_finish(struct slz_stream *strm, unsigned char *buf); + +/* Functions specific to rfc1950 (zlib) */ +uint32_t slz_adler32_by1(uint32_t crc, const unsigned char *buf, int len); +uint32_t slz_adler32_block(uint32_t crc, const unsigned char *buf, long len); +long slz_rfc1950_encode(struct slz_stream *strm, unsigned char *out, const unsigned char *in, long ilen, int more); +int slz_rfc1950_send_header(struct slz_stream *strm, unsigned char *buf); +int slz_rfc1950_init(struct slz_stream *strm, int level); +int slz_rfc1950_flush(struct slz_stream *strm, unsigned char *buf); +int slz_rfc1950_finish(struct slz_stream *strm, unsigned char *buf); + +/* generic functions */ + +/* Initializes stream <strm>. It will configure the stream to use format + * <format> for the data, which must be one of SLZ_FMT_*. The compression level + * passed in <level> is set. This value can only be 0 (no compression) or 1 + * (compression) and other values will lead to unpredictable behaviour. The + * function should always return 0. + */ +static inline int slz_init(struct slz_stream *strm, int level, int format) +{ + int ret; + + if (format == SLZ_FMT_GZIP) + ret = slz_rfc1952_init(strm, level); + else if (format == SLZ_FMT_ZLIB) + ret = slz_rfc1950_init(strm, level); + else { /* deflate for anything else */ + ret = slz_rfc1951_init(strm, level); + strm->format = format; + } + return ret; +} + +/* Encodes the block according to the format used by the stream. This means + * that the CRC of the input block may be computed according to the CRC32 or + * adler-32 algorithms. The number of output bytes is returned. 
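As a usage illustration of the generic entry points (slz_init() above, slz_encode() and slz_finish() just below), here is a minimal gzip sketch. It assumes the matching slz.c implementation is linked in; the fixed output buffer size is a simplification that is only safe for this tiny input.

#include <stdio.h>
#include <string.h>
#include "slz.h"

int main(void)
{
	static const char msg[] = "hello, hello, hello, hello!";
	unsigned char out[256];         /* generous for this tiny input */
	struct slz_stream strm;
	long len;

	slz_init(&strm, 1, SLZ_FMT_GZIP);                   /* level 1 = compress */
	len  = slz_encode(&strm, out, msg, strlen(msg), 0); /* 0 = final chunk */
	len += slz_finish(&strm, out + len);                /* trailer (and header if needed) */
	printf("%ld gzip bytes produced\n", len);
	return 0;
}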
+ */
+static inline long slz_encode(struct slz_stream *strm, void *out,
+                              const void *in, long ilen, int more)
+{
+	long ret;
+
+	if (strm->format == SLZ_FMT_GZIP)
+		ret = slz_rfc1952_encode(strm, (unsigned char *) out, (const unsigned char *) in, ilen, more);
+	else if (strm->format == SLZ_FMT_ZLIB)
+		ret = slz_rfc1950_encode(strm, (unsigned char *) out, (const unsigned char *) in, ilen, more);
+	else /* deflate for other ones */
+		ret = slz_rfc1951_encode(strm, (unsigned char *) out, (const unsigned char *) in, ilen, more);
+
+	return ret;
+}
+
+/* Flushes pending bits and sends the trailer for stream <strm> into buffer
+ * <buf> if needed. When it's done, the stream state is updated to SLZ_ST_END.
+ * It returns the number of bytes emitted. The trailer consists of flushing the
+ * possibly pending bits from the queue (up to 24 bits), rounding to the next
+ * byte, then 4 bytes for the CRC when doing zlib/gzip, then another 4 bytes
+ * for the input length for gzip. That may amount to 4+4+4 = 12 bytes, which
+ * the caller must ensure are available before calling the function. Note that
+ * if the initial header was never sent, it will be sent first as well (up to
+ * 10 extra bytes).
+ */
+static inline int slz_finish(struct slz_stream *strm, void *buf)
+{
+	int ret;
+
+	if (strm->format == SLZ_FMT_GZIP)
+		ret = slz_rfc1952_finish(strm, (unsigned char *) buf);
+	else if (strm->format == SLZ_FMT_ZLIB)
+		ret = slz_rfc1950_finish(strm, (unsigned char *) buf);
+	else /* deflate for other ones */
+		ret = slz_rfc1951_finish(strm, (unsigned char *) buf);
+
+	return ret;
+}
+
+/* Flushes any pending data for stream <strm> into buffer <buf>, then emits an
+ * empty literal block to byte-align the output, allowing the queue to be
+ * flushed completely. Note that if the initial header was never sent, it will
+ * be sent first as well (0, 2 or 10 extra bytes). This requires that the
+ * output buffer still has this plus the size of the queue available (up to 4
+ * bytes), plus one byte for (BFINAL,BTYPE), plus 4 bytes for LEN+NLEN, or a
+ * total of 19 bytes in the worst case. The number of bytes emitted is
+ * returned. It is guaranteed that the queue is empty on return. This may
+ * cause some overhead by adding needless 5-byte blocks if called too often.
+ */
+static inline int slz_flush(struct slz_stream *strm, void *buf)
+{
+	int ret;
+
+	if (strm->format == SLZ_FMT_GZIP)
+		ret = slz_rfc1952_flush(strm, (unsigned char *) buf);
+	else if (strm->format == SLZ_FMT_ZLIB)
+		ret = slz_rfc1950_flush(strm, (unsigned char *) buf);
+	else /* deflate for other ones */
+		ret = slz_rfc1951_flush(strm, (unsigned char *) buf);
+
+	return ret;
+}
+
+#endif
diff --git a/include/import/xxhash.h b/include/import/xxhash.h
new file mode 100644
index 0000000..a18e8c7
--- /dev/null
+++ b/include/import/xxhash.h
@@ -0,0 +1,6773 @@
+/*
+ * xxHash - Extremely Fast Hash algorithm
+ * Header File
+ * Copyright (C) 2012-2021 Yann Collet
+ *
+ * BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ *      notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ *      copyright notice, this list of conditions and the following disclaimer
+ *      in the documentation and/or other materials provided with the
+ *      distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * You can contact the author at: + * - xxHash homepage: https://www.xxhash.com + * - xxHash source repository: https://github.com/Cyan4973/xxHash + */ + +/*! + * @mainpage xxHash + * + * xxHash is an extremely fast non-cryptographic hash algorithm, working at RAM speed + * limits. + * + * It is proposed in four flavors, in three families: + * 1. @ref XXH32_family + * - Classic 32-bit hash function. Simple, compact, and runs on almost all + * 32-bit and 64-bit systems. + * 2. @ref XXH64_family + * - Classic 64-bit adaptation of XXH32. Just as simple, and runs well on most + * 64-bit systems (but _not_ 32-bit systems). + * 3. @ref XXH3_family + * - Modern 64-bit and 128-bit hash function family which features improved + * strength and performance across the board, especially on smaller data. + * It benefits greatly from SIMD and 64-bit without requiring it. + * + * Benchmarks + * --- + * The reference system uses an Intel i7-9700K CPU, and runs Ubuntu x64 20.04. + * The open source benchmark program is compiled with clang v10.0 using -O3 flag. + * + * | Hash Name | ISA ext | Width | Large Data Speed | Small Data Velocity | + * | -------------------- | ------- | ----: | ---------------: | ------------------: | + * | XXH3_64bits() | @b AVX2 | 64 | 59.4 GB/s | 133.1 | + * | MeowHash | AES-NI | 128 | 58.2 GB/s | 52.5 | + * | XXH3_128bits() | @b AVX2 | 128 | 57.9 GB/s | 118.1 | + * | CLHash | PCLMUL | 64 | 37.1 GB/s | 58.1 | + * | XXH3_64bits() | @b SSE2 | 64 | 31.5 GB/s | 133.1 | + * | XXH3_128bits() | @b SSE2 | 128 | 29.6 GB/s | 118.1 | + * | RAM sequential read | | N/A | 28.0 GB/s | N/A | + * | ahash | AES-NI | 64 | 22.5 GB/s | 107.2 | + * | City64 | | 64 | 22.0 GB/s | 76.6 | + * | T1ha2 | | 64 | 22.0 GB/s | 99.0 | + * | City128 | | 128 | 21.7 GB/s | 57.7 | + * | FarmHash | AES-NI | 64 | 21.3 GB/s | 71.9 | + * | XXH64() | | 64 | 19.4 GB/s | 71.0 | + * | SpookyHash | | 64 | 19.3 GB/s | 53.2 | + * | Mum | | 64 | 18.0 GB/s | 67.0 | + * | CRC32C | SSE4.2 | 32 | 13.0 GB/s | 57.9 | + * | XXH32() | | 32 | 9.7 GB/s | 71.9 | + * | City32 | | 32 | 9.1 GB/s | 66.0 | + * | Blake3* | @b AVX2 | 256 | 4.4 GB/s | 8.1 | + * | Murmur3 | | 32 | 3.9 GB/s | 56.1 | + * | SipHash* | | 64 | 3.0 GB/s | 43.2 | + * | Blake3* | @b SSE2 | 256 | 2.4 GB/s | 8.1 | + * | HighwayHash | | 64 | 1.4 GB/s | 6.0 | + * | FNV64 | | 64 | 1.2 GB/s | 62.7 | + * | Blake2* | | 256 | 1.1 GB/s | 5.1 | + * | SHA1* | | 160 | 0.8 GB/s | 5.6 | + * | MD5* | | 128 | 0.6 GB/s | 7.8 | + * @note + * - Hashes which require a specific ISA extension are noted. SSE2 is also noted, + * even though it is mandatory on x64. + * - Hashes with an asterisk are cryptographic. Note that MD5 is non-cryptographic + * by modern standards. 
+ * - Small data velocity is a rough average of algorithm's efficiency for small + * data. For more accurate information, see the wiki. + * - More benchmarks and strength tests are found on the wiki: + * https://github.com/Cyan4973/xxHash/wiki + * + * Usage + * ------ + * All xxHash variants use a similar API. Changing the algorithm is a trivial + * substitution. + * + * @pre + * For functions which take an input and length parameter, the following + * requirements are assumed: + * - The range from [`input`, `input + length`) is valid, readable memory. + * - The only exception is if the `length` is `0`, `input` may be `NULL`. + * - For C++, the objects must have the *TriviallyCopyable* property, as the + * functions access bytes directly as if it was an array of `unsigned char`. + * + * @anchor single_shot_example + * **Single Shot** + * + * These functions are stateless functions which hash a contiguous block of memory, + * immediately returning the result. They are the easiest and usually the fastest + * option. + * + * XXH32(), XXH64(), XXH3_64bits(), XXH3_128bits() + * + * @code{.c} + * #include <string.h> + * #include "xxhash.h" + * + * // Example for a function which hashes a null terminated string with XXH32(). + * XXH32_hash_t hash_string(const char* string, XXH32_hash_t seed) + * { + * // NULL pointers are only valid if the length is zero + * size_t length = (string == NULL) ? 0 : strlen(string); + * return XXH32(string, length, seed); + * } + * @endcode + * + * @anchor streaming_example + * **Streaming** + * + * These groups of functions allow incremental hashing of unknown size, even + * more than what would fit in a size_t. + * + * XXH32_reset(), XXH64_reset(), XXH3_64bits_reset(), XXH3_128bits_reset() + * + * @code{.c} + * #include <stdio.h> + * #include <assert.h> + * #include "xxhash.h" + * // Example for a function which hashes a FILE incrementally with XXH3_64bits(). + * XXH64_hash_t hashFile(FILE* f) + * { + * // Allocate a state struct. Do not just use malloc() or new. + * XXH3_state_t* state = XXH3_createState(); + * assert(state != NULL && "Out of memory!"); + * // Reset the state to start a new hashing session. + * XXH3_64bits_reset(state); + * char buffer[4096]; + * size_t count; + * // Read the file in chunks + * while ((count = fread(buffer, 1, sizeof(buffer), f)) != 0) { + * // Run update() as many times as necessary to process the data + * XXH3_64bits_update(state, buffer, count); + * } + * // Retrieve the finalized hash. This will not change the state. + * XXH64_hash_t result = XXH3_64bits_digest(state); + * // Free the state. Do not use free(). + * XXH3_freeState(state); + * return result; + * } + * @endcode + * + * @file xxhash.h + * xxHash prototypes and implementation + */ + +#if defined (__cplusplus) +extern "C" { +#endif + +/* **************************** + * INLINE mode + ******************************/ +/*! + * @defgroup public Public API + * Contains details on the public xxHash functions. + * @{ + */ +#ifdef XXH_DOXYGEN +/*! + * @brief Gives access to internal state declaration, required for static allocation. + * + * Incompatible with dynamic linking, due to risks of ABI changes. + * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #include "xxhash.h" + * @endcode + */ +# define XXH_STATIC_LINKING_ONLY +/* Do not undef XXH_STATIC_LINKING_ONLY for Doxygen */ + +/*! + * @brief Gives access to internal definitions. 
+ * + * Usage: + * @code{.c} + * #define XXH_STATIC_LINKING_ONLY + * #define XXH_IMPLEMENTATION + * #include "xxhash.h" + * @endcode + */ +# define XXH_IMPLEMENTATION +/* Do not undef XXH_IMPLEMENTATION for Doxygen */ + +/*! + * @brief Exposes the implementation and marks all functions as `inline`. + * + * Use these build macros to inline xxhash into the target unit. + * Inlining improves performance on small inputs, especially when the length is + * expressed as a compile-time constant: + * + * https://fastcompression.blogspot.com/2018/03/xxhash-for-small-keys-impressive-power.html + * + * It also keeps xxHash symbols private to the unit, so they are not exported. + * + * Usage: + * @code{.c} + * #define XXH_INLINE_ALL + * #include "xxhash.h" + * @endcode + * Do not compile and link xxhash.o as a separate object, as it is not useful. + */ +# define XXH_INLINE_ALL +# undef XXH_INLINE_ALL +/*! + * @brief Exposes the implementation without marking functions as inline. + */ +# define XXH_PRIVATE_API +# undef XXH_PRIVATE_API +/*! + * @brief Emulate a namespace by transparently prefixing all symbols. + * + * If you want to include _and expose_ xxHash functions from within your own + * library, but also want to avoid symbol collisions with other libraries which + * may also include xxHash, you can use @ref XXH_NAMESPACE to automatically prefix + * any public symbol from xxhash library with the value of @ref XXH_NAMESPACE + * (therefore, avoid empty or numeric values). + * + * Note that no change is required within the calling program as long as it + * includes `xxhash.h`: Regular symbol names will be automatically translated + * by this header. + */ +# define XXH_NAMESPACE /* YOUR NAME HERE */ +# undef XXH_NAMESPACE +#endif + +#if (defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)) \ + && !defined(XXH_INLINE_ALL_31684351384) + /* this section should be traversed only once */ +# define XXH_INLINE_ALL_31684351384 + /* give access to the advanced API, required to compile implementations */ +# undef XXH_STATIC_LINKING_ONLY /* avoid macro redef */ +# define XXH_STATIC_LINKING_ONLY + /* make all functions private */ +# undef XXH_PUBLIC_API +# if defined(__GNUC__) +# define XXH_PUBLIC_API static __inline __attribute__((unused)) +# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define XXH_PUBLIC_API static inline +# elif defined(_MSC_VER) +# define XXH_PUBLIC_API static __inline +# else + /* note: this version may generate warnings for unused static functions */ +# define XXH_PUBLIC_API static +# endif + + /* + * This part deals with the special case where a unit wants to inline xxHash, + * but "xxhash.h" has previously been included without XXH_INLINE_ALL, + * such as part of some previously included *.h header file. + * Without further action, the new include would just be ignored, + * and functions would effectively _not_ be inlined (silent failure). + * The following macros solve this situation by prefixing all inlined names, + * avoiding naming collision with previous inclusions. + */ + /* Before that, we unconditionally #undef all symbols, + * in case they were already defined with XXH_NAMESPACE. 
+ * They will then be redefined for XXH_INLINE_ALL + */ +# undef XXH_versionNumber + /* XXH32 */ +# undef XXH32 +# undef XXH32_createState +# undef XXH32_freeState +# undef XXH32_reset +# undef XXH32_update +# undef XXH32_digest +# undef XXH32_copyState +# undef XXH32_canonicalFromHash +# undef XXH32_hashFromCanonical + /* XXH64 */ +# undef XXH64 +# undef XXH64_createState +# undef XXH64_freeState +# undef XXH64_reset +# undef XXH64_update +# undef XXH64_digest +# undef XXH64_copyState +# undef XXH64_canonicalFromHash +# undef XXH64_hashFromCanonical + /* XXH3_64bits */ +# undef XXH3_64bits +# undef XXH3_64bits_withSecret +# undef XXH3_64bits_withSeed +# undef XXH3_64bits_withSecretandSeed +# undef XXH3_createState +# undef XXH3_freeState +# undef XXH3_copyState +# undef XXH3_64bits_reset +# undef XXH3_64bits_reset_withSeed +# undef XXH3_64bits_reset_withSecret +# undef XXH3_64bits_update +# undef XXH3_64bits_digest +# undef XXH3_generateSecret + /* XXH3_128bits */ +# undef XXH128 +# undef XXH3_128bits +# undef XXH3_128bits_withSeed +# undef XXH3_128bits_withSecret +# undef XXH3_128bits_reset +# undef XXH3_128bits_reset_withSeed +# undef XXH3_128bits_reset_withSecret +# undef XXH3_128bits_reset_withSecretandSeed +# undef XXH3_128bits_update +# undef XXH3_128bits_digest +# undef XXH128_isEqual +# undef XXH128_cmp +# undef XXH128_canonicalFromHash +# undef XXH128_hashFromCanonical + /* Finally, free the namespace itself */ +# undef XXH_NAMESPACE + + /* employ the namespace for XXH_INLINE_ALL */ +# define XXH_NAMESPACE XXH_INLINE_ + /* + * Some identifiers (enums, type names) are not symbols, + * but they must nonetheless be renamed to avoid redeclaration. + * Alternative solution: do not redeclare them. + * However, this requires some #ifdefs, and has a more dispersed impact. + * Meanwhile, renaming can be achieved in a single place. + */ +# define XXH_IPREF(Id) XXH_NAMESPACE ## Id +# define XXH_OK XXH_IPREF(XXH_OK) +# define XXH_ERROR XXH_IPREF(XXH_ERROR) +# define XXH_errorcode XXH_IPREF(XXH_errorcode) +# define XXH32_canonical_t XXH_IPREF(XXH32_canonical_t) +# define XXH64_canonical_t XXH_IPREF(XXH64_canonical_t) +# define XXH128_canonical_t XXH_IPREF(XXH128_canonical_t) +# define XXH32_state_s XXH_IPREF(XXH32_state_s) +# define XXH32_state_t XXH_IPREF(XXH32_state_t) +# define XXH64_state_s XXH_IPREF(XXH64_state_s) +# define XXH64_state_t XXH_IPREF(XXH64_state_t) +# define XXH3_state_s XXH_IPREF(XXH3_state_s) +# define XXH3_state_t XXH_IPREF(XXH3_state_t) +# define XXH128_hash_t XXH_IPREF(XXH128_hash_t) + /* Ensure the header is parsed again, even if it was previously included */ +# undef XXHASH_H_5627135585666179 +# undef XXHASH_H_STATIC_13879238742 +#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */ + +/* **************************************************************** + * Stable API + *****************************************************************/ +#ifndef XXHASH_H_5627135585666179 +#define XXHASH_H_5627135585666179 1 + +/*! @brief Marks a global symbol. 
*/ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +/* XXH32 */ +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +/* XXH64 */ +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +/* XXH3_64bits */ +# define XXH3_64bits XXH_NAME2(XXH_NAMESPACE, XXH3_64bits) +# define XXH3_64bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecret) +# define XXH3_64bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSeed) +# define XXH3_64bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_withSecretandSeed) +# define XXH3_createState XXH_NAME2(XXH_NAMESPACE, XXH3_createState) +# define XXH3_freeState XXH_NAME2(XXH_NAMESPACE, XXH3_freeState) +# define XXH3_copyState XXH_NAME2(XXH_NAMESPACE, XXH3_copyState) +# define XXH3_64bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset) +# define XXH3_64bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSeed) +# define XXH3_64bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecret) +# define XXH3_64bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_reset_withSecretandSeed) +# define XXH3_64bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_update) +# define XXH3_64bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_64bits_digest) +# define XXH3_generateSecret XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret) +# define XXH3_generateSecret_fromSeed XXH_NAME2(XXH_NAMESPACE, XXH3_generateSecret_fromSeed) +/* XXH3_128bits */ +# define XXH128 XXH_NAME2(XXH_NAMESPACE, XXH128) +# define XXH3_128bits XXH_NAME2(XXH_NAMESPACE, XXH3_128bits) +# define XXH3_128bits_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSeed) +# define XXH3_128bits_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecret) +# define XXH3_128bits_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_withSecretandSeed) +# define XXH3_128bits_reset XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset) +# define XXH3_128bits_reset_withSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSeed) +# define 
XXH3_128bits_reset_withSecret XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecret) +# define XXH3_128bits_reset_withSecretandSeed XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_reset_withSecretandSeed) +# define XXH3_128bits_update XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_update) +# define XXH3_128bits_digest XXH_NAME2(XXH_NAMESPACE, XXH3_128bits_digest) +# define XXH128_isEqual XXH_NAME2(XXH_NAMESPACE, XXH128_isEqual) +# define XXH128_cmp XXH_NAME2(XXH_NAMESPACE, XXH128_cmp) +# define XXH128_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH128_canonicalFromHash) +# define XXH128_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH128_hashFromCanonical) +#endif + + +/* ************************************* +* Compiler specifics +***************************************/ + +/* specific declaration modes for Windows */ +#if !defined(XXH_INLINE_ALL) && !defined(XXH_PRIVATE_API) +# if defined(WIN32) && defined(_MSC_VER) && (defined(XXH_IMPORT) || defined(XXH_EXPORT)) +# ifdef XXH_EXPORT +# define XXH_PUBLIC_API __declspec(dllexport) +# elif XXH_IMPORT +# define XXH_PUBLIC_API __declspec(dllimport) +# endif +# else +# define XXH_PUBLIC_API /* do nothing */ +# endif +#endif + +#if defined (__GNUC__) +# define XXH_CONSTF __attribute__((const)) +# define XXH_PUREF __attribute__((pure)) +# define XXH_MALLOCF __attribute__((malloc)) +#else +# define XXH_CONSTF /* disable */ +# define XXH_PUREF +# define XXH_MALLOCF +#endif + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 8 +#define XXH_VERSION_RELEASE 2 +/*! @brief Version number, encoded as two digits each */ +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) + +/*! + * @brief Obtains the xxHash version. + * + * This is mostly useful when xxHash is compiled as a shared library, + * since the returned value comes from the library, as opposed to header file. + * + * @return @ref XXH_VERSION_NUMBER of the invoked library. + */ +XXH_PUBLIC_API XXH_CONSTF unsigned XXH_versionNumber (void); + + +/* **************************** +* Common basic types +******************************/ +#include <stddef.h> /* size_t */ +/*! + * @brief Exit code for the streaming API. + */ +typedef enum { + XXH_OK = 0, /*!< OK */ + XXH_ERROR /*!< Error */ +} XXH_errorcode; + + +/*-********************************************************************** +* 32-bit hash +************************************************************************/ +#if defined(XXH_DOXYGEN) /* Don't show <stdint.h> include */ +/*! + * @brief An unsigned 32-bit integer. + * + * Not necessarily defined to `uint32_t` but functionally equivalent. + */ +typedef uint32_t XXH32_hash_t; + +#elif !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint32_t XXH32_hash_t; + +#else +# include <limits.h> +# if UINT_MAX == 0xFFFFFFFFUL + typedef unsigned int XXH32_hash_t; +# elif ULONG_MAX == 0xFFFFFFFFUL + typedef unsigned long XXH32_hash_t; +# else +# error "unsupported platform: need a 32-bit type" +# endif +#endif + +/*! + * @} + * + * @defgroup XXH32_family XXH32 family + * @ingroup public + * Contains functions used in the classic 32-bit xxHash algorithm. + * + * @note + * XXH32 is useful for older platforms, with no or poor 64-bit performance. 
+ * Note that the @ref XXH3_family provides competitive speed for both 32-bit + * and 64-bit systems, and offers true 64/128 bit hash results. + * + * @see @ref XXH64_family, @ref XXH3_family : Other xxHash families + * @see @ref XXH32_impl for implementation details + * @{ + */ + +/*! + * @brief Calculates the 32-bit hash of @p input using xxHash32. + * + * Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark): 5.4 GB/s + * + * See @ref single_shot_example "Single Shot Example" for an example. + * + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * @param seed The 32-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 32-bit hash value. + * + * @see + * XXH64(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH32_createState(), XXH32_update(), XXH32_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32 (const void* input, size_t length, XXH32_hash_t seed); + +#ifndef XXH_NO_STREAM +/*! + * Streaming functions generate the xxHash value from an incremental input. + * This method is slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * An XXH state must first be allocated using `XXH*_createState()`. + * + * Start a new hash by initializing the state with a seed using `XXH*_reset()`. + * + * Then, feed the hash state by calling `XXH*_update()` as many times as necessary. + * + * The function returns an error code, with 0 meaning OK, and any other value + * meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using `XXH*_digest()`. + * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a + * digest, and generate new hash values later on by invoking `XXH*_digest()`. + * + * When done, release the state using `XXH*_freeState()`. + * + * @see streaming_example at the top of @ref xxhash.h for an example. + */ + +/*! + * @typedef struct XXH32_state_s XXH32_state_t + * @brief The opaque state struct for the XXH32 streaming API. + * + * @see XXH32_state_s for details. + */ +typedef struct XXH32_state_s XXH32_state_t; + +/*! + * @brief Allocates an @ref XXH32_state_t. + * + * Must be freed with XXH32_freeState(). + * @return An allocated XXH32_state_t on success, `NULL` on failure. + */ +XXH_PUBLIC_API XXH_MALLOCF XXH32_state_t* XXH32_createState(void); +/*! + * @brief Frees an @ref XXH32_state_t. + * + * Must be allocated with XXH32_createState(). + * @param statePtr A pointer to an @ref XXH32_state_t allocated with @ref XXH32_createState(). + * @return XXH_OK. + */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); +/*! + * @brief Copies one @ref XXH32_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dst_state, const XXH32_state_t* src_state); + +/*! + * @brief Resets an @ref XXH32_state_t to begin a new hash. 
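+ *
+ * A single state may be reused for several hashing sessions; a minimal
+ * sketch (illustrative only, error handling omitted):
+ * @code{.c}
+ * XXH32_state_t* const state = XXH32_createState();
+ * XXH32_reset(state, 0);                         // session 1
+ * XXH32_update(state, "abc", 3);
+ * XXH32_hash_t const h1 = XXH32_digest(state);
+ * XXH32_reset(state, 0);                         // session 2, same state
+ * XXH32_update(state, "abcdef", 6);
+ * XXH32_hash_t const h2 = XXH32_digest(state);
+ * XXH32_freeState(state);
+ * @endcode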
+ *
+ * This function resets and seeds a state. Call it before @ref XXH32_update().
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 32-bit seed to alter the hash result predictably.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, XXH32_hash_t seed);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH32_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ * @pre
+ * The memory between @p input and @p input + @p length must be valid,
+ * readable, contiguous memory. However, if @p length is `0`, @p input may be
+ * `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated hash value from an @ref XXH32_state_t.
+ *
+ * @note
+ * Calling XXH32_digest() will not affect @p statePtr, so you can update,
+ * digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return The calculated xxHash32 value from that state.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/******* Canonical representation *******/
+
+/*
+ * The default return values from XXH functions are unsigned 32 and 64 bit
+ * integers.
+ * This is the simplest and fastest format for further post-processing.
+ *
+ * However, this leaves open the question of what is the order on the byte level,
+ * since little and big endian conventions will store the same number differently.
+ *
+ * The canonical representation settles this issue by mandating big-endian
+ * convention, the same convention as human-readable numbers (large digits first).
+ *
+ * When writing hash values to storage, sending them over a network, or printing
+ * them, it's highly recommended to use the canonical representation to ensure
+ * portability across a wider range of systems, present and future.
+ *
+ * The following functions allow transformation of hash values to and from
+ * canonical format.
+ */
+
+/*!
+ * @brief Canonical (big endian) representation of @ref XXH32_hash_t.
+ */
+typedef struct {
+    unsigned char digest[4]; /*!< Hash bytes, big endian */
+} XXH32_canonical_t;
+
+/*!
+ * @brief Converts an @ref XXH32_hash_t to a big endian @ref XXH32_canonical_t.
+ *
+ * @param dst The @ref XXH32_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH32_hash_t to be converted.
+ *
+ * @pre
+ * @p dst must not be `NULL`.
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH32_canonical_t to a native @ref XXH32_hash_t.
+ *
+ * @param src The @ref XXH32_canonical_t to convert.
+ *
+ * @pre
+ * @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src);
+
+
+/*! @cond Doxygen ignores this part */
+#ifdef __has_attribute
+# define XXH_HAS_ATTRIBUTE(x) __has_attribute(x)
+#else
+# define XXH_HAS_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * C23 __STDC_VERSION__ number hasn't been specified yet. For now
+ * leave as `201711L` (C17 + 1).
+ * TODO: Update to correct value when it's been specified.
+ */
+#define XXH_C23_VN 201711L
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/* C-language Attributes are added in C23. */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) && defined(__has_c_attribute)
+# define XXH_HAS_C_ATTRIBUTE(x) __has_c_attribute(x)
+#else
+# define XXH_HAS_C_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+#if defined(__cplusplus) && defined(__has_cpp_attribute)
+# define XXH_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x)
+#else
+# define XXH_HAS_CPP_ATTRIBUTE(x) 0
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * Define XXH_FALLTHROUGH macro for annotating switch case with the 'fallthrough' attribute
+ * introduced in CPP17 and C23.
+ * CPP17 : https://en.cppreference.com/w/cpp/language/attributes/fallthrough
+ * C23 : https://en.cppreference.com/w/c/language/attributes/fallthrough
+ */
+#if XXH_HAS_C_ATTRIBUTE(fallthrough) || XXH_HAS_CPP_ATTRIBUTE(fallthrough)
+# define XXH_FALLTHROUGH [[fallthrough]]
+#elif XXH_HAS_ATTRIBUTE(__fallthrough__)
+# define XXH_FALLTHROUGH __attribute__ ((__fallthrough__))
+#else
+# define XXH_FALLTHROUGH /* fallthrough */
+#endif
+/*! @endcond */
+
+/*! @cond Doxygen ignores this part */
+/*
+ * Define XXH_NOESCAPE for annotated pointers in public API.
+ * https://clang.llvm.org/docs/AttributeReference.html#noescape
+ * As of writing this, only supported by clang.
+ */
+#if XXH_HAS_ATTRIBUTE(noescape)
+# define XXH_NOESCAPE __attribute__((noescape))
+#else
+# define XXH_NOESCAPE
+#endif
+/*! @endcond */
+
+
+/*!
+ * @}
+ * @ingroup public
+ * @{
+ */
+
+#ifndef XXH_NO_LONG_LONG
+/*-**********************************************************************
+* 64-bit hash
+************************************************************************/
+#if defined(XXH_DOXYGEN) /* don't include <stdint.h> */
+/*!
+ * @brief An unsigned 64-bit integer.
+ *
+ * Not necessarily defined to `uint64_t` but functionally equivalent.
+ */
+typedef uint64_t XXH64_hash_t;
+#elif !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef uint64_t XXH64_hash_t;
+#else
+# include <limits.h>
+# if defined(__LP64__) && ULONG_MAX == 0xFFFFFFFFFFFFFFFFULL
+    /* LP64 ABI says uint64_t is unsigned long */
+    typedef unsigned long XXH64_hash_t;
+# else
+    /* the following type must have a width of 64-bit */
+    typedef unsigned long long XXH64_hash_t;
+# endif
+#endif
+
+/*!
+ * @}
+ *
+ * @defgroup XXH64_family XXH64 family
+ * @ingroup public
+ * @{
+ * Contains functions used in the classic 64-bit xxHash algorithm.
+ *
+ * @note
+ * XXH3 provides competitive speed for both 32-bit and 64-bit systems,
+ * and offers true 64/128 bit hash results.
+ * It provides better speed for systems with vector processing capabilities.
+ */
+
+/*!
+ * @brief Calculates the 64-bit hash of @p input using xxHash64.
+ *
+ * This function usually runs faster on 64-bit systems, but slower on 32-bit
+ * systems (see benchmark).
+ *
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes. + * @param seed The 64-bit seed to alter the hash's output predictably. + * + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return The calculated 64-bit hash. + * + * @see + * XXH32(), XXH3_64bits_withSeed(), XXH3_128bits_withSeed(), XXH128(): + * Direct equivalents for the other variants of xxHash. + * @see + * XXH64_createState(), XXH64_update(), XXH64_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/*! + * @brief The opaque state struct for the XXH64 streaming API. + * + * @see XXH64_state_s for details. + */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ + +/*! + * @brief Allocates an @ref XXH64_state_t. + * + * Must be freed with XXH64_freeState(). + * @return An allocated XXH64_state_t on success, `NULL` on failure. + */ +XXH_PUBLIC_API XXH_MALLOCF XXH64_state_t* XXH64_createState(void); + +/*! + * @brief Frees an @ref XXH64_state_t. + * + * Must be allocated with XXH64_createState(). + * @param statePtr A pointer to an @ref XXH64_state_t allocated with @ref XXH64_createState(). + * @return XXH_OK. + */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + +/*! + * @brief Copies one @ref XXH64_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. + */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dst_state, const XXH64_state_t* src_state); + +/*! + * @brief Resets an @ref XXH64_state_t to begin a new hash. + * + * This function resets and seeds a state. Call it before @ref XXH64_update(). + * + * @param statePtr The state struct to reset. + * @param seed The 64-bit seed to alter the hash result predictably. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed); + +/*! + * @brief Consumes a block of @p input to an @ref XXH64_state_t. + * + * Call this to incrementally consume blocks of data. + * + * @param statePtr The state struct to update. + * @param input The block of data to be hashed, at least @p length bytes in size. + * @param length The length of @p input, in bytes. + * + * @pre + * @p statePtr must not be `NULL`. + * @pre + * The memory between @p input and @p input + @p length must be valid, + * readable, contiguous memory. However, if @p length is `0`, @p input may be + * `NULL`. In C++, this also must be *TriviallyCopyable*. + * + * @return @ref XXH_OK on success, @ref XXH_ERROR on failure. + */ +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH_NOESCAPE XXH64_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief Returns the calculated hash value from an @ref XXH64_state_t. + * + * @note + * Calling XXH64_digest() will not affect @p statePtr, so you can update, + * digest, and update again. + * + * @param statePtr The state struct to calculate the hash from. + * + * @pre + * @p statePtr must not be `NULL`. + * + * @return The calculated xxHash64 value from that state. 
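+ *
+ * For example, the note above allows hashing successive prefixes of a
+ * stream (a sketch; assumes `state` was reset and has already consumed
+ * "hello"):
+ * @code{.c}
+ * XXH64_hash_t const h1 = XXH64_digest(state);   // hash of "hello"
+ * XXH64_update(state, " world", 6);
+ * XXH64_hash_t const h2 = XXH64_digest(state);   // hash of "hello world"
+ * @endcode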
+ */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_digest (XXH_NOESCAPE const XXH64_state_t* statePtr); +#endif /* !XXH_NO_STREAM */ +/******* Canonical representation *******/ + +/*! + * @brief Canonical (big endian) representation of @ref XXH64_hash_t. + */ +typedef struct { unsigned char digest[sizeof(XXH64_hash_t)]; } XXH64_canonical_t; + +/*! + * @brief Converts an @ref XXH64_hash_t to a big endian @ref XXH64_canonical_t. + * + * @param dst The @ref XXH64_canonical_t pointer to be stored to. + * @param hash The @ref XXH64_hash_t to be converted. + * + * @pre + * @p dst must not be `NULL`. + */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash); + +/*! + * @brief Converts an @ref XXH64_canonical_t to a native @ref XXH64_hash_t. + * + * @param src The @ref XXH64_canonical_t to convert. + * + * @pre + * @p src must not be `NULL`. + * + * @return The converted hash. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src); + +#ifndef XXH_NO_XXH3 + +/*! + * @} + * ************************************************************************ + * @defgroup XXH3_family XXH3 family + * @ingroup public + * @{ + * + * XXH3 is a more recent hash algorithm featuring: + * - Improved speed for both small and large inputs + * - True 64-bit and 128-bit outputs + * - SIMD acceleration + * - Improved 32-bit viability + * + * Speed analysis methodology is explained here: + * + * https://fastcompression.blogspot.com/2019/03/presenting-xxh3.html + * + * Compared to XXH64, expect XXH3 to run approximately + * ~2x faster on large inputs and >3x faster on small ones, + * exact differences vary depending on platform. + * + * XXH3's speed benefits greatly from SIMD and 64-bit arithmetic, + * but does not require it. + * Most 32-bit and 64-bit targets that can run XXH32 smoothly can run XXH3 + * at competitive speeds, even without vector support. Further details are + * explained in the implementation. + * + * XXH3 has a fast scalar implementation, but it also includes accelerated SIMD + * implementations for many common platforms: + * - AVX512 + * - AVX2 + * - SSE2 + * - ARM NEON + * - WebAssembly SIMD128 + * - POWER8 VSX + * - s390x ZVector + * This can be controlled via the @ref XXH_VECTOR macro, but it automatically + * selects the best version according to predefined macros. For the x86 family, an + * automatic runtime dispatcher is included separately in @ref xxh_x86dispatch.c. + * + * XXH3 implementation is portable: + * it has a generic C90 formulation that can be compiled on any platform, + * all implementations generate exactly the same hash value on all platforms. + * Starting from v0.8.0, it's also labelled "stable", meaning that + * any future version will also generate the same hash value. + * + * XXH3 offers 2 variants, _64bits and _128bits. + * + * When only 64 bits are needed, prefer invoking the _64bits variant, as it + * reduces the amount of mixing, resulting in faster speed on small inputs. + * It's also generally simpler to manipulate a scalar return type than a struct. + * + * The API supports one-shot hashing, streaming mode, and custom secrets. + */ +/*-********************************************************************** +* XXH3 64-bit variant +************************************************************************/ + +/*! + * @brief 64-bit unseeded variant of XXH3. 
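+ *
+ * One-shot use is a single call (a sketch; `buf` and `bufSize` stand for
+ * any readable buffer and its size):
+ * @code{.c}
+ * XXH64_hash_t const h = XXH3_64bits(buf, bufSize);
+ * @endcode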
+ * + * This is equivalent to @ref XXH3_64bits_withSeed() with a seed of 0, however + * it may have slightly better performance due to constant propagation of the + * defaults. + * + * @see + * XXH32(), XXH64(), XXH3_128bits(): equivalent for the other xxHash algorithms + * @see + * XXH3_64bits_withSeed(), XXH3_64bits_withSecret(): other seeding variants + * @see + * XXH3_64bits_reset(), XXH3_64bits_update(), XXH3_64bits_digest(): Streaming version. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length); + +/*! + * @brief 64-bit seeded variant of XXH3 + * + * This variant generates a custom secret on the fly based on default secret + * altered using the `seed` value. + * + * While this operation is decently fast, note that it's not completely free. + * + * @note + * seed == 0 produces the same results as @ref XXH3_64bits(). + * + * @param input The data to hash + * @param length The length + * @param seed The 64-bit seed to alter the state. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed); + +/*! + * The bare minimum size for a custom secret. + * + * @see + * XXH3_64bits_withSecret(), XXH3_64bits_reset_withSecret(), + * XXH3_128bits_withSecret(), XXH3_128bits_reset_withSecret(). + */ +#define XXH3_SECRET_SIZE_MIN 136 + +/*! + * @brief 64-bit variant of XXH3 with a custom "secret". + * + * It's possible to provide any blob of bytes as a "secret" to generate the hash. + * This makes it more difficult for an external actor to prepare an intentional collision. + * The main condition is that secretSize *must* be large enough (>= XXH3_SECRET_SIZE_MIN). + * However, the quality of the secret impacts the dispersion of the hash algorithm. + * Therefore, the secret _must_ look like a bunch of random bytes. + * Avoid "trivial" or structured data such as repeated sequences or a text document. + * Whenever in doubt about the "randomness" of the blob of bytes, + * consider employing "XXH3_generateSecret()" instead (see below). + * It will generate a proper high entropy secret derived from the blob of bytes. + * Another advantage of using XXH3_generateSecret() is that + * it guarantees that all bits within the initial blob of bytes + * will impact every bit of the output. + * This is not necessarily the case when using the blob of bytes directly + * because, when hashing _small_ inputs, only a portion of the secret is employed. + */ +XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize); + + +/******* Streaming *******/ +#ifndef XXH_NO_STREAM +/* + * Streaming requires state maintenance. + * This operation costs memory and CPU. + * As a consequence, streaming is slower than one-shot hashing. + * For better performance, prefer one-shot functions whenever applicable. + */ + +/*! + * @brief The state struct for the XXH3 streaming API. + * + * @see XXH3_state_s for details. + */ +typedef struct XXH3_state_s XXH3_state_t; +XXH_PUBLIC_API XXH_MALLOCF XXH3_state_t* XXH3_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr); + +/*! + * @brief Copies one @ref XXH3_state_t to another. + * + * @param dst_state The state to copy to. + * @param src_state The state to copy from. + * @pre + * @p dst_state and @p src_state must not be `NULL` and must not overlap. 
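+ *
+ * Copying lets two sessions share a common prefix that is hashed only once.
+ * A sketch (illustrative; `prefix`, `tailA`, `tailB` and their lengths are
+ * hypothetical inputs):
+ * @code{.c}
+ * XXH3_state_t* const a = XXH3_createState();
+ * XXH3_state_t* const b = XXH3_createState();
+ * XXH3_64bits_reset(a);
+ * XXH3_64bits_update(a, prefix, prefixLen);   // shared prefix, hashed once
+ * XXH3_copyState(b, a);                       // b resumes from a's position
+ * XXH3_64bits_update(a, tailA, tailALen);
+ * XXH3_64bits_update(b, tailB, tailBLen);
+ * // ... digest both, then XXH3_freeState(a) and XXH3_freeState(b)
+ * @endcode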
+ */
+XXH_PUBLIC_API void XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * This function resets `statePtr` and generates a secret with default parameters. Call it before @ref XXH3_64bits_update().
+ * Digest will be equivalent to `XXH3_64bits()`.
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * This function resets `statePtr` and generates a secret from `seed`. Call it before @ref XXH3_64bits_update().
+ * Digest will be equivalent to `XXH3_64bits_withSeed()`.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 64-bit seed to alter the state.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+
+/*!
+ * XXH3_64bits_reset_withSecret():
+ * `secret` is referenced; it _must outlive_ the hash streaming session.
+ * Similar to the one-shot API, `secretSize` must be >= `XXH3_SECRET_SIZE_MIN`,
+ * and the quality of produced hash values depends on secret's entropy
+ * (secret's content should look like a bunch of random bytes).
+ * When in doubt about the randomness of a candidate `secret`,
+ * consider employing `XXH3_generateSecret()` instead (see below).
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ * @pre
+ * The memory between @p input and @p input + @p length must be valid,
+ * readable, contiguous memory. However, if @p length is `0`, @p input may be
+ * `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_64bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 64-bit hash value from an @ref XXH3_state_t.
+ *
+ * @note
+ * Calling XXH3_64bits_digest() will not affect @p statePtr, so you can update,
+ * digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 64-bit hash value from that state.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* note : canonical representation of XXH3 is the same as XXH64
+ * since they both produce XXH64_hash_t values */
+
+
+/*-**********************************************************************
+* XXH3 128-bit variant
+************************************************************************/
+
+/*!
+ * @brief The return value from 128-bit hashes.
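+ *
+ * For example, a 128-bit hash can be printed in the usual hex form from its
+ * two fields (a sketch; `buf`/`bufSize` are hypothetical, needs <stdio.h>):
+ * @code{.c}
+ * XXH128_hash_t const h = XXH3_128bits(buf, bufSize);
+ * printf("%016llx%016llx\n",
+ *        (unsigned long long)h.high64, (unsigned long long)h.low64);
+ * @endcode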
+ *
+ * Stored in little endian order, although the fields themselves are in native
+ * endianness.
+ */
+typedef struct {
+    XXH64_hash_t low64;  /*!< `value & 0xFFFFFFFFFFFFFFFF` */
+    XXH64_hash_t high64; /*!< `value >> 64` */
+} XXH128_hash_t;
+
+/*!
+ * @brief Unseeded 128-bit variant of XXH3
+ *
+ * The 128-bit variant of XXH3 has more strength, but it has a bit of overhead
+ * for shorter inputs.
+ *
+ * This is equivalent to @ref XXH3_128bits_withSeed() with a seed of 0, however
+ * it may have slightly better performance due to constant propagation of the
+ * defaults.
+ *
+ * @see
+ * XXH32(), XXH64(), XXH3_64bits(): equivalent for the other xxHash algorithms
+ * @see
+ * XXH3_128bits_withSeed(), XXH3_128bits_withSecret(): other seeding variants
+ * @see
+ * XXH3_128bits_reset(), XXH3_128bits_update(), XXH3_128bits_digest(): Streaming version.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* data, size_t len);
+/*! @brief Seeded 128-bit variant of XXH3. @see XXH3_64bits_withSeed(). */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSeed(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_withSecret(). */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_withSecret(XXH_NOESCAPE const void* data, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/******* Streaming *******/
+#ifndef XXH_NO_STREAM
+/*
+ * Streaming requires state maintenance.
+ * This operation costs memory and CPU.
+ * As a consequence, streaming is slower than one-shot hashing.
+ * For better performance, prefer one-shot functions whenever applicable.
+ *
+ * XXH3_128bits uses the same XXH3_state_t as XXH3_64bits().
+ * Use already declared XXH3_createState() and XXH3_freeState().
+ *
+ * All reset and streaming functions have the same meaning as their 64-bit counterparts.
+ */
+
+/*!
+ * @brief Resets an @ref XXH3_state_t to begin a new hash.
+ *
+ * This function resets `statePtr` and generates a secret with default parameters. Call it before @ref XXH3_128bits_update().
+ * Digest will be equivalent to `XXH3_128bits()`.
+ *
+ * @param statePtr The state struct to reset.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr);
+
+/*!
+ * @brief Resets an @ref XXH3_state_t with 64-bit seed to begin a new hash.
+ *
+ * This function resets `statePtr` and generates a secret from `seed`. Call it before @ref XXH3_128bits_update().
+ * Digest will be equivalent to `XXH3_128bits_withSeed()`.
+ *
+ * @param statePtr The state struct to reset.
+ * @param seed The 64-bit seed to alter the state.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ *
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed);
+/*! @brief Custom secret 128-bit variant of XXH3. @see XXH3_64bits_reset_withSecret(). */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize);
+
+/*!
+ * @brief Consumes a block of @p input to an @ref XXH3_state_t.
+ *
+ * Call this to incrementally consume blocks of data.
+ *
+ * @param statePtr The state struct to update.
+ * @param input The block of data to be hashed, at least @p length bytes in size.
+ * @param length The length of @p input, in bytes.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ * @pre
+ * The memory between @p input and @p input + @p length must be valid,
+ * readable, contiguous memory. However, if @p length is `0`, @p input may be
+ * `NULL`. In C++, this also must be *TriviallyCopyable*.
+ *
+ * @return @ref XXH_OK on success, @ref XXH_ERROR on failure.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_128bits_update (XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* input, size_t length);
+
+/*!
+ * @brief Returns the calculated XXH3 128-bit hash value from an @ref XXH3_state_t.
+ *
+ * @note
+ * Calling XXH3_128bits_digest() will not affect @p statePtr, so you can update,
+ * digest, and update again.
+ *
+ * @param statePtr The state struct to calculate the hash from.
+ *
+ * @pre
+ * @p statePtr must not be `NULL`.
+ *
+ * @return The calculated XXH3 128-bit hash value from that state.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* statePtr);
+#endif /* !XXH_NO_STREAM */
+
+/* The following helper functions make it possible to compare XXH128_hash_t values.
+ * Since XXH128_hash_t is a structure, this capability is not offered by the language.
+ * Note: For better performance, these functions can be inlined using XXH_INLINE_ALL */
+
+/*!
+ * XXH128_isEqual():
+ * Return: 1 if `h1` and `h2` are equal, 0 if they are not.
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2);
+
+/*!
+ * @brief Compares two @ref XXH128_hash_t
+ * This comparator is compatible with stdlib's `qsort()`/`bsearch()`.
+ *
+ * @return: >0 if *h128_1  > *h128_2
+ *          =0 if *h128_1 == *h128_2
+ *          <0 if *h128_1  < *h128_2
+ */
+XXH_PUBLIC_API XXH_PUREF int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2);
+
+
+/******* Canonical representation *******/
+typedef struct { unsigned char digest[sizeof(XXH128_hash_t)]; } XXH128_canonical_t;
+
+
+/*!
+ * @brief Converts an @ref XXH128_hash_t to a big endian @ref XXH128_canonical_t.
+ *
+ * @param dst The @ref XXH128_canonical_t pointer to be stored to.
+ * @param hash The @ref XXH128_hash_t to be converted.
+ *
+ * @pre
+ * @p dst must not be `NULL`.
+ */
+XXH_PUBLIC_API void XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash);
+
+/*!
+ * @brief Converts an @ref XXH128_canonical_t to a native @ref XXH128_hash_t.
+ *
+ * @param src The @ref XXH128_canonical_t to convert.
+ *
+ * @pre
+ * @p src must not be `NULL`.
+ *
+ * @return The converted hash.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src);
+
+
+#endif /* !XXH_NO_XXH3 */
+#endif /* XXH_NO_LONG_LONG */
+
+/*!
+ * @}
+ */
+#endif /* XXHASH_H_5627135585666179 */
+
+
+
+#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742)
+#define XXHASH_H_STATIC_13879238742
+/* ****************************************************************************
+ * This section contains declarations which are not guaranteed to remain stable.
+ * They may change in future versions, becoming incompatible with a different
+ * version of the library.
+ * These declarations should only be used with static linking.
+ * Never use them in association with dynamic linking!
+ ***************************************************************************** */
+
+/*
+ * These definitions are only present to allow static allocation
+ * of XXH states, on stack or in a struct, for example.
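+ * A stack-allocation sketch (illustrative; requires XXH_STATIC_LINKING_ONLY):
+ *     XXH3_state_t state;        // statically allocated, no create/free
+ *     XXH3_INITSTATE(&state);    // see XXH3_INITSTATE() below
+ *     XXH3_64bits_reset(&state);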
+ * Never **ever** access their members directly. + */ + +/*! + * @internal + * @brief Structure for XXH32 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH32_state_t. + * Do not access the members of this struct directly. + * @see XXH64_state_s, XXH3_state_s + */ +struct XXH32_state_s { + XXH32_hash_t total_len_32; /*!< Total length hashed, modulo 2^32 */ + XXH32_hash_t large_len; /*!< Whether the hash is >= 16 (handles @ref total_len_32 overflow) */ + XXH32_hash_t v[4]; /*!< Accumulator lanes */ + XXH32_hash_t mem32[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[16]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem32 */ + XXH32_hash_t reserved; /*!< Reserved field. Do not read nor write to it. */ +}; /* typedef'd to XXH32_state_t */ + + +#ifndef XXH_NO_LONG_LONG /* defined when there is no 64-bit support */ + +/*! + * @internal + * @brief Structure for XXH64 streaming API. + * + * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY, + * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined. Otherwise it is + * an opaque type. This allows fields to safely be changed. + * + * Typedef'd to @ref XXH64_state_t. + * Do not access the members of this struct directly. + * @see XXH32_state_s, XXH3_state_s + */ +struct XXH64_state_s { + XXH64_hash_t total_len; /*!< Total length hashed. This is always 64-bit. */ + XXH64_hash_t v[4]; /*!< Accumulator lanes */ + XXH64_hash_t mem64[4]; /*!< Internal buffer for partial reads. Treated as unsigned char[32]. */ + XXH32_hash_t memsize; /*!< Amount of data in @ref mem64 */ + XXH32_hash_t reserved32; /*!< Reserved field, needed for padding anyways*/ + XXH64_hash_t reserved64; /*!< Reserved field. Do not read or write to it. */ +}; /* typedef'd to XXH64_state_t */ + +#ifndef XXH_NO_XXH3 + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) /* >= C11 */ +# include <stdalign.h> +# define XXH_ALIGN(n) alignas(n) +#elif defined(__cplusplus) && (__cplusplus >= 201103L) /* >= C++11 */ +/* In C++ alignas() is a keyword */ +# define XXH_ALIGN(n) alignas(n) +#elif defined(__GNUC__) +# define XXH_ALIGN(n) __attribute__ ((aligned(n))) +#elif defined(_MSC_VER) +# define XXH_ALIGN(n) __declspec(align(n)) +#else +# define XXH_ALIGN(n) /* disabled */ +#endif + +/* Old GCC versions only accept the attribute after the type in structures. */ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) /* C11+ */ \ + && ! (defined(__cplusplus) && (__cplusplus >= 201103L)) /* >= C++11 */ \ + && defined(__GNUC__) +# define XXH_ALIGN_MEMBER(align, type) type XXH_ALIGN(align) +#else +# define XXH_ALIGN_MEMBER(align, type) XXH_ALIGN(align) type +#endif + +/*! + * @brief The size of the internal XXH3 buffer. + * + * This is the optimal update size for incremental hashing. + * + * @see XXH3_64b_update(), XXH3_128b_update(). + */ +#define XXH3_INTERNALBUFFER_SIZE 256 + +/*! + * @internal + * @brief Default size of the secret buffer (and @ref XXH3_kSecret). + * + * This is the size used in @ref XXH3_kSecret and the seeded functions. + * + * Not to be confused with @ref XXH3_SECRET_SIZE_MIN. + */ +#define XXH3_SECRET_DEFAULT_SIZE 192 + +/*! + * @internal + * @brief Structure for XXH3 streaming API. 
+ *
+ * @note This is only defined when @ref XXH_STATIC_LINKING_ONLY,
+ * @ref XXH_INLINE_ALL, or @ref XXH_IMPLEMENTATION is defined.
+ * Otherwise it is an opaque type.
+ * Never use this definition in combination with a dynamic library.
+ * This allows fields to safely be changed in the future.
+ *
+ * @note ** This structure has a strict alignment requirement of 64 bytes!! **
+ * Do not allocate this with `malloc()` or `new`,
+ * it will not be sufficiently aligned.
+ * Use @ref XXH3_createState() and @ref XXH3_freeState(), or stack allocation.
+ *
+ * Typedef'd to @ref XXH3_state_t.
+ * Never access the members of this struct directly.
+ *
+ * @see XXH3_INITSTATE() for stack initialization.
+ * @see XXH3_createState(), XXH3_freeState().
+ * @see XXH32_state_s, XXH64_state_s
+ */
+struct XXH3_state_s {
+    XXH_ALIGN_MEMBER(64, XXH64_hash_t acc[8]);
+        /*!< The 8 accumulators. See @ref XXH32_state_s::v and @ref XXH64_state_s::v */
+    XXH_ALIGN_MEMBER(64, unsigned char customSecret[XXH3_SECRET_DEFAULT_SIZE]);
+        /*!< Used to store a custom secret generated from a seed. */
+    XXH_ALIGN_MEMBER(64, unsigned char buffer[XXH3_INTERNALBUFFER_SIZE]);
+        /*!< The internal buffer. @see XXH32_state_s::mem32 */
+    XXH32_hash_t bufferedSize;
+        /*!< The amount of memory in @ref buffer, @see XXH32_state_s::memsize */
+    XXH32_hash_t useSeed;
+        /*!< Reserved field. Needed for padding on 64-bit. */
+    size_t nbStripesSoFar;
+        /*!< Number of stripes processed. */
+    XXH64_hash_t totalLen;
+        /*!< Total length hashed. 64-bit even on 32-bit targets. */
+    size_t nbStripesPerBlock;
+        /*!< Number of stripes per block. */
+    size_t secretLimit;
+        /*!< Size of @ref customSecret or @ref extSecret */
+    XXH64_hash_t seed;
+        /*!< Seed for _withSeed variants. Must be zero otherwise, @see XXH3_INITSTATE() */
+    XXH64_hash_t reserved64;
+        /*!< Reserved field. */
+    const unsigned char* extSecret;
+        /*!< Reference to an external secret for the _withSecret variants, NULL
+         *   for other variants. */
+    /* note: there may be some padding at the end due to alignment on 64 bytes */
+}; /* typedef'd to XXH3_state_t */
+
+#undef XXH_ALIGN_MEMBER
+
+/*!
+ * @brief Initializes a stack-allocated `XXH3_state_s`.
+ *
+ * When the @ref XXH3_state_t structure is merely emplaced on stack,
+ * it should be initialized with XXH3_INITSTATE() or a memset()
+ * in case its first reset uses XXH3_NNbits_reset_withSeed().
+ * This init can be omitted if the first reset uses default or _withSecret mode.
+ * This operation isn't necessary when the state is created with XXH3_createState().
+ * Note that this doesn't prepare the state for a streaming operation,
+ * it's still necessary to use XXH3_NNbits_reset*() afterwards.
+ */
+#define XXH3_INITSTATE(XXH3_state_ptr)                       \
+    do {                                                     \
+        XXH3_state_t* tmp_xxh3_state_ptr = (XXH3_state_ptr); \
+        tmp_xxh3_state_ptr->seed = 0;                        \
+        tmp_xxh3_state_ptr->extSecret = NULL;                \
+    } while(0)
+
+
+/*!
+ * simple alias to pre-selected XXH3_128bits variant
+ */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t XXH128(XXH_NOESCAPE const void* data, size_t len, XXH64_hash_t seed);
+
+
+/* === Experimental API === */
+/* Symbols defined below must be considered tied to a specific library version. */
+
+/*!
+ * XXH3_generateSecret():
+ *
+ * Derive a high-entropy secret from any user-defined content, named customSeed.
+ * The generated secret can be used in combination with `*_withSecret()` functions.
+ * The `_withSecret()` variants are useful to provide a higher level of protection
+ * than a 64-bit seed, as it becomes much more difficult for an external actor to
+ * guess how to impact the calculation logic.
+ *
+ * The function accepts as input a custom seed of any length and any content,
+ * and derives from it a high-entropy secret of length @p secretSize into an
+ * already allocated buffer @p secretBuffer.
+ *
+ * The generated secret can then be used with any `*_withSecret()` variant.
+ * The functions @ref XXH3_128bits_withSecret(), @ref XXH3_64bits_withSecret(),
+ * @ref XXH3_128bits_reset_withSecret() and @ref XXH3_64bits_reset_withSecret()
+ * are part of this list. They all accept a `secret` parameter
+ * which must be large enough for implementation reasons (>= @ref XXH3_SECRET_SIZE_MIN)
+ * _and_ feature very high entropy (consist of random-looking bytes).
+ * These conditions can be a high bar to meet, so @ref XXH3_generateSecret() can
+ * be employed to ensure proper quality.
+ *
+ * @p customSeed can be anything. It can have any size, even small ones,
+ * and its content can be anything, even "poor entropy" sources such as a bunch
+ * of zeroes. The resulting `secret` will nonetheless provide all required qualities.
+ *
+ * @pre
+ *   - @p secretSize must be >= @ref XXH3_SECRET_SIZE_MIN
+ *   - When @p customSeedSize > 0, supplying NULL as customSeed is undefined behavior.
+ *
+ * Example code:
+ * @code{.c}
+ *    #include <stdio.h>
+ *    #include <stdlib.h>
+ *    #include <string.h>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Hashes argv[2] using the entropy from argv[1].
+ *    int main(int argc, char* argv[])
+ *    {
+ *        char secret[XXH3_SECRET_SIZE_MIN];
+ *        if (argc != 3) { return 1; }
+ *        XXH3_generateSecret(secret, sizeof(secret), argv[1], strlen(argv[1]));
+ *        XXH64_hash_t h = XXH3_64bits_withSecret(
+ *             argv[2], strlen(argv[2]),
+ *             secret, sizeof(secret)
+ *        );
+ *        printf("%016llx\n", (unsigned long long) h);
+ *    }
+ * @endcode
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize);
+
+/*!
+ * @brief Generate the same secret as the _withSeed() variants.
+ *
+ * The generated secret can be used in combination with
+ * `*_withSecret()` and `_withSecretandSeed()` variants.
+ *
+ * Example C++ `std::string` hash class:
+ * @code{.cpp}
+ *    #include <string>
+ *    #define XXH_STATIC_LINKING_ONLY // expose unstable API
+ *    #include "xxhash.h"
+ *    // Slow, seeds each time
+ *    class HashSlow {
+ *        XXH64_hash_t seed;
+ *    public:
+ *        HashSlow(XXH64_hash_t s) : seed{s} {}
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{XXH3_64bits_withSeed(x.c_str(), x.length(), seed)};
+ *        }
+ *    };
+ *    // Fast, caches the seeded secret for future uses.
+ *    class HashFast {
+ *        unsigned char secret[XXH3_SECRET_SIZE_MIN];
+ *    public:
+ *        HashFast(XXH64_hash_t s) {
+ *            XXH3_generateSecret_fromSeed(secret, s);
+ *        }
+ *        size_t operator()(const std::string& x) const {
+ *            return size_t{
+ *                XXH3_64bits_withSecret(x.c_str(), x.length(), secret, sizeof(secret))
+ *            };
+ *        }
+ *    };
+ * @endcode
+ * @param secretBuffer A writable buffer of @ref XXH3_SECRET_SIZE_MIN bytes
+ * @param seed The 64-bit seed to derive the secret from.
+ */
+XXH_PUBLIC_API void XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed);
+
+/*!
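+ * @brief Combined secret-and-seed variants of XXH3.
+ *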
+ * These variants generate hash values using either
+ * @p seed for "short" keys (< XXH3_MIDSIZE_MAX = 240 bytes)
+ * or @p secret for "large" keys (>= XXH3_MIDSIZE_MAX).
+ *
+ * This generally benefits speed, compared to `_withSeed()` or `_withSecret()`.
+ * `_withSeed()` has to generate the secret on the fly for "large" keys.
+ * It's fast, but the cost can be perceptible for "not so large" keys (< 1 KB).
+ * `_withSecret()` has to generate the masks on the fly for "small" keys,
+ * which requires more instructions than _withSeed() variants.
+ * Therefore, the _withSecretandSeed() variant combines the best of both worlds.
+ *
+ * When @p secret has been generated by XXH3_generateSecret_fromSeed(),
+ * this variant produces *exactly* the same results as the `_withSeed()` variant,
+ * hence offering only a pure speed benefit on "large" input,
+ * by skipping the need to regenerate the secret for every large input.
+ *
+ * Another usage scenario is to hash the secret to a 64-bit hash value,
+ * for example with XXH3_64bits(), which then becomes the seed,
+ * and then employ both the seed and the secret in _withSecretandSeed().
+ * On top of speed, an added benefit is that each bit in the secret
+ * has a 50% chance to swap each bit in the output, via its impact on the seed.
+ *
+ * This is not guaranteed when using the secret directly in "small data" scenarios,
+ * because only portions of the secret are employed for small data.
+ */
+XXH_PUBLIC_API XXH_PUREF XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* data, size_t len,
+                              XXH_NOESCAPE const void* secret, size_t secretSize,
+                              XXH64_hash_t seed);
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_PUREF XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length,
+                               XXH_NOESCAPE const void* secret, size_t secretSize,
+                               XXH64_hash_t seed64);
+#ifndef XXH_NO_STREAM
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                    XXH_NOESCAPE const void* secret, size_t secretSize,
+                                    XXH64_hash_t seed64);
+/*! @copydoc XXH3_64bits_withSecretandSeed() */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr,
+                                     XXH_NOESCAPE const void* secret, size_t secretSize,
+                                     XXH64_hash_t seed64);
+#endif /* !XXH_NO_STREAM */
+
+#endif /* !XXH_NO_XXH3 */
+#endif /* XXH_NO_LONG_LONG */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
+# define XXH_IMPLEMENTATION
+#endif
+
+#endif /* defined(XXH_STATIC_LINKING_ONLY) && !defined(XXHASH_H_STATIC_13879238742) */
+
+
+/* ======================================================================== */
+/* ======================================================================== */
+/* ======================================================================== */
+
+
+/*-**********************************************************************
+ * xxHash implementation
+ *-**********************************************************************
+ * xxHash's implementation used to be hosted inside xxhash.c.
+ *
+ * However, inlining requires implementation to be visible to the compiler,
+ * hence be included alongside the header.
+ * Previously, implementation was hosted inside xxhash.c,
+ * which was then #included when inlining was activated.
+ * This construction created issues with a few build and install systems,
+ * as it required xxhash.c to be stored in the /include directory.
+ *
+ * The xxHash implementation is now directly integrated within xxhash.h.
+ * As a consequence, xxhash.c is no longer needed in /include.
+ *
+ * xxhash.c is still available and is still useful.
+ * In a "normal" setup, when xxhash is not inlined,
+ * xxhash.h only exposes the prototypes and public symbols,
+ * while xxhash.c can be built into an object file xxhash.o
+ * which can then be linked into the final binary.
+ ************************************************************************/
+
+#if ( defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API) \
+   || defined(XXH_IMPLEMENTATION) ) && !defined(XXH_IMPLEM_13a8737387)
+# define XXH_IMPLEM_13a8737387
+
+/* *************************************
+* Tuning parameters
+***************************************/
+
+/*!
+ * @defgroup tuning Tuning parameters
+ * @{
+ *
+ * Various macros to control xxHash's behavior.
+ */
+#ifdef XXH_DOXYGEN
+/*!
+ * @brief Define this to disable 64-bit code.
+ *
+ * Useful if you are only using the @ref XXH32_family and have a strict C90 compiler.
+ */
+# define XXH_NO_LONG_LONG
+# undef XXH_NO_LONG_LONG /* don't actually */
+/*!
+ * @brief Controls how unaligned memory is accessed.
+ *
+ * By default, access to unaligned memory is performed through `memcpy()`, which is
+ * safe and portable.
+ *
+ * Unfortunately, on some target/compiler combinations, the generated assembly
+ * is sub-optimal.
+ *
+ * The switch below allows selection of a different access method
+ * in the search for improved performance.
+ *
+ * @par Possible options:
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=0` (default): `memcpy`
+ *  @par
+ *    Use `memcpy()`. Safe and portable. Note that most modern compilers will
+ *    eliminate the function call and treat it as an unaligned access.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=1`: `__attribute__((aligned(1)))`
+ *  @par
+ *    Depends on compiler extensions and is therefore not portable.
+ *    This method is safe _if_ your compiler supports it,
+ *    and *generally* as fast or faster than `memcpy`.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=2`: Direct cast
+ *  @par
+ *    Casts directly and dereferences. This method doesn't depend on the
+ *    compiler, but it violates the C standard as it directly dereferences an
+ *    unaligned pointer. It can generate buggy code on targets which do not
+ *    support unaligned memory accesses, but in some circumstances, it's the
+ *    only known way to get the most performance.
+ *
+ *  - `XXH_FORCE_MEMORY_ACCESS=3`: Byteshift
+ *  @par
+ *    Also portable. This can generate the best code on old compilers which don't
+ *    inline small `memcpy()` calls, and it might also be faster on big-endian
+ *    systems which lack a native byteswap instruction. However, some compilers
+ *    will emit literal byteshifts even if the target supports unaligned access.
+ *
+ *
+ * @warning
+ *   Methods 1 and 2 rely on implementation-defined behavior. Use these with
+ *   care, as what works on one compiler/platform/optimization level may cause
+ *   another to read garbage data or even crash.
+ *
+ * See https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html for details.
+ *
+ * Prefer these methods in priority order (0 > 3 > 1 > 2).
+ */
+# define XXH_FORCE_MEMORY_ACCESS 0
+
+/*!
+ * @def XXH_SIZE_OPT
+ * @brief Controls how much xxHash optimizes for size.
+ *
+ * xxHash, when compiled, tends to result in a rather large binary size. This
+ * is mostly due to heavy usage of forced inlining and constant folding of the
+ * @ref XXH3_family to increase performance. 
+ *
+ * However, some developers prefer size over speed. This option can
+ * significantly reduce the size of the generated code. When using the `-Os`
+ * or `-Oz` options on GCC or Clang, this is defined to 1 by default,
+ * otherwise it is defined to 0.
+ *
+ * Most of these size optimizations can be controlled manually.
+ *
+ * This is a number from 0 to 2.
+ *  - `XXH_SIZE_OPT` == 0: Default. xxHash makes no size optimizations. Speed
+ *    comes first.
+ *  - `XXH_SIZE_OPT` == 1: Default for `-Os` and `-Oz`. xxHash is more
+ *    conservative and disables hacks that increase code size. It implies the
+ *    options @ref XXH_NO_INLINE_HINTS == 1, @ref XXH_FORCE_ALIGN_CHECK == 0,
+ *    and @ref XXH3_NEON_LANES == 8 if they are not already defined.
+ *  - `XXH_SIZE_OPT` == 2: xxHash tries to make itself as small as possible.
+ *    Performance may cry. For example, the single-shot functions just use the
+ *    streaming API.
+ */
+# define XXH_SIZE_OPT 0
+
+/*!
+ * @def XXH_FORCE_ALIGN_CHECK
+ * @brief If defined to non-zero, adds a special path for aligned inputs (XXH32()
+ * and XXH64() only).
+ *
+ * This is an important performance trick for architectures without decent
+ * unaligned memory access performance.
+ *
+ * It checks for input alignment, and when conditions are met, uses a "fast
+ * path" employing direct 32-bit/64-bit reads, resulting in _dramatically
+ * faster_ read speed.
+ *
+ * The check costs one initial branch per hash, which is generally negligible,
+ * but not zero.
+ *
+ * Moreover, it's not useful to generate an additional code path if memory
+ * access uses the same instruction for both aligned and unaligned
+ * addresses (e.g. x86 and aarch64).
+ *
+ * In these cases, the alignment check can be removed by setting this macro to 0.
+ * Then the code will always use unaligned memory access.
+ * The alignment check is automatically disabled on x86, x64, ARM64, and some ARM
+ * chips, which are platforms known to offer good unaligned memory access performance.
+ *
+ * It is also disabled by default when @ref XXH_SIZE_OPT >= 1.
+ *
+ * This option does not affect XXH3 (only XXH32 and XXH64).
+ */
+# define XXH_FORCE_ALIGN_CHECK 0
+
+/*!
+ * @def XXH_NO_INLINE_HINTS
+ * @brief When non-zero, sets all functions to `static`.
+ *
+ * By default, xxHash tries to force the compiler to inline almost all internal
+ * functions.
+ *
+ * This can usually improve performance due to reduced jumping and improved
+ * constant folding, but significantly increases the size of the binary, which
+ * might not be favorable.
+ *
+ * Additionally, sometimes the forced inlining can be detrimental to performance,
+ * depending on the architecture.
+ *
+ * XXH_NO_INLINE_HINTS marks all internal functions as static, giving the
+ * compiler full control over whether to inline or not.
+ *
+ * When not optimizing (-O0), using `-fno-inline` with GCC or Clang, or if
+ * @ref XXH_SIZE_OPT >= 1, this will automatically be defined.
+ */
+# define XXH_NO_INLINE_HINTS 0
+
+/*!
+ * @def XXH3_INLINE_SECRET
+ * @brief Determines whether to inline the XXH3 withSecret code.
+ *
+ * When the secret size is known, the compiler can improve the performance
+ * of XXH3_64bits_withSecret() and XXH3_128bits_withSecret().
+ *
+ * However, if the secret size is not known, it doesn't have any benefit. This
+ * happens when xxHash is compiled into a global symbol. Therefore, if
+ * @ref XXH_INLINE_ALL is *not* defined, this will be defined to 0. 
+ *
+ * Additionally, this defaults to 0 on GCC 12+, which has an issue with function pointers
+ * that are *sometimes* force inlined on -Og, and it is impossible to automatically
+ * detect this optimization level.
+ */
+# define XXH3_INLINE_SECRET 0
+
+/*!
+ * @def XXH32_ENDJMP
+ * @brief Whether to use a jump for `XXH32_finalize`.
+ *
+ * For performance, `XXH32_finalize` uses multiple branches in the finalizer.
+ * This is generally faster, but depending on the exact architecture, a jump
+ * may be preferable.
+ *
+ * This setting can only possibly make a difference for very small inputs.
+ */
+# define XXH32_ENDJMP 0
+
+/*!
+ * @internal
+ * @brief Redefines old internal names.
+ *
+ * For compatibility with code that uses xxHash's internals before the names
+ * were changed to improve namespacing. There is no other reason to use this.
+ */
+# define XXH_OLD_NAMES
+# undef XXH_OLD_NAMES /* don't actually use, it is ugly. */
+
+/*!
+ * @def XXH_NO_STREAM
+ * @brief Disables the streaming API.
+ *
+ * When xxHash is not inlined and the streaming functions are not used, disabling
+ * them can improve code size significantly, especially with the @ref XXH3_family,
+ * which tends to make constant folded copies of itself.
+ */
+# define XXH_NO_STREAM
+# undef XXH_NO_STREAM /* don't actually */
+#endif /* XXH_DOXYGEN */
+/*!
+ * @}
+ */
+
+#ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
+   /* Prefer method 1 (__attribute__((aligned(1)))) for GCC.
+    * Exception: ARM < v7 with unaligned access (e.g. Raspbian armhf), where
+    * method 1 still compiles to byte shifting while memcpy, for some reason,
+    * produces proper unaligned loads, so method 0 is kept there. */
+# if defined(__GNUC__) && !(defined(__ARM_ARCH) && __ARM_ARCH < 7 && defined(__ARM_FEATURE_UNALIGNED))
+#   define XXH_FORCE_MEMORY_ACCESS 1
+# endif
+#endif
+
+#ifndef XXH_SIZE_OPT
+   /* default to 1 for -Os or -Oz */
+# if (defined(__GNUC__) || defined(__clang__)) && defined(__OPTIMIZE_SIZE__)
+#   define XXH_SIZE_OPT 1
+# else
+#   define XXH_SIZE_OPT 0
+# endif
+#endif
+
+#ifndef XXH_FORCE_ALIGN_CHECK  /* can be defined externally */
+   /* don't check on sizeopt, x86, aarch64, or arm when unaligned access is available */
+# if XXH_SIZE_OPT >= 1 || \
+     defined(__i386) || defined(__x86_64__) || defined(__aarch64__) || defined(__ARM_FEATURE_UNALIGNED) \
+  || defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64) || defined(_M_ARM) /* visual */
+#   define XXH_FORCE_ALIGN_CHECK 0
+# else
+#   define XXH_FORCE_ALIGN_CHECK 1
+# endif
+#endif
+
+#ifndef XXH_NO_INLINE_HINTS
+# if XXH_SIZE_OPT >= 1 || defined(__NO_INLINE__)  /* -O0, -fno-inline */
+#   define XXH_NO_INLINE_HINTS 1
+# else
+#   define XXH_NO_INLINE_HINTS 0
+# endif
+#endif
+
+#ifndef XXH3_INLINE_SECRET
+# if (defined(__GNUC__) && !defined(__clang__) && __GNUC__ >= 12) \
+     || !defined(XXH_INLINE_ALL)
+#   define XXH3_INLINE_SECRET 0
+# else
+#   define XXH3_INLINE_SECRET 1
+# endif
+#endif
+
+#ifndef XXH32_ENDJMP
+/* generally preferable for performance */
+# define XXH32_ENDJMP 0
+#endif
+
+/*!
+ * @defgroup impl Implementation
+ * @{
+ */
+
+
+/* *************************************
+* Includes & Memory related functions
+***************************************/
+#if defined(XXH_NO_STREAM)
+/* nothing */
+#elif defined(XXH_NO_STDLIB)
+
+/* When requesting to disable any mention of stdlib,
+ * the library loses the ability to invoke malloc / free.
+ * In practice, it means that functions like `XXH*_createState()`
+ * will always fail, and return NULL. 
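+ * A caller must therefore be prepared for that failure mode.
+ * A minimal sketch (buf and len are hypothetical):
+ *
+ *     XXH3_state_t* const s = XXH3_createState();
+ *     if (s == NULL) return XXH3_64bits(buf, len); // no heap: use the one-shot API
+ *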
+ * This flag is useful in situations where + * xxhash.h is integrated into some kernel, embedded or limited environment + * without access to dynamic allocation. + */ + +static XXH_CONSTF void* XXH_malloc(size_t s) { (void)s; return NULL; } +static void XXH_free(void* p) { (void)p; } + +#else + +/* + * Modify the local functions below should you wish to use + * different memory routines for malloc() and free() + */ +#include <stdlib.h> + +/*! + * @internal + * @brief Modify this function to use a different routine than malloc(). + */ +static XXH_MALLOCF void* XXH_malloc(size_t s) { return malloc(s); } + +/*! + * @internal + * @brief Modify this function to use a different routine than free(). + */ +static void XXH_free(void* p) { free(p); } + +#endif /* XXH_NO_STDLIB */ + +#include <string.h> + +/*! + * @internal + * @brief Modify this function to use a different routine than memcpy(). + */ +static void* XXH_memcpy(void* dest, const void* src, size_t size) +{ + return memcpy(dest,src,size); +} + +#include <limits.h> /* ULLONG_MAX */ + + +/* ************************************* +* Compiler Specific Options +***************************************/ +#ifdef _MSC_VER /* Visual Studio warning fix */ +# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */ +#endif + +#if XXH_NO_INLINE_HINTS /* disable inlining hints */ +# if defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __attribute__((unused)) +# else +# define XXH_FORCE_INLINE static +# endif +# define XXH_NO_INLINE static +/* enable inlining hints */ +#elif defined(__GNUC__) || defined(__clang__) +# define XXH_FORCE_INLINE static __inline__ __attribute__((always_inline, unused)) +# define XXH_NO_INLINE static __attribute__((noinline)) +#elif defined(_MSC_VER) /* Visual Studio */ +# define XXH_FORCE_INLINE static __forceinline +# define XXH_NO_INLINE static __declspec(noinline) +#elif defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* C99 */ +# define XXH_FORCE_INLINE static inline +# define XXH_NO_INLINE static +#else +# define XXH_FORCE_INLINE static +# define XXH_NO_INLINE static +#endif + +#if XXH3_INLINE_SECRET +# define XXH3_WITH_SECRET_INLINE XXH_FORCE_INLINE +#else +# define XXH3_WITH_SECRET_INLINE XXH_NO_INLINE +#endif + + +/* ************************************* +* Debug +***************************************/ +/*! + * @ingroup tuning + * @def XXH_DEBUGLEVEL + * @brief Sets the debugging level. + * + * XXH_DEBUGLEVEL is expected to be defined externally, typically via the + * compiler's command line options. The value must be a number. 
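+ *
+ * For example, a hypothetical invocation enabling the internal XXH_ASSERT()
+ * checks: `cc -DXXH_DEBUGLEVEL=1 -c xxhash.c`.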
+ */
+#ifndef XXH_DEBUGLEVEL
+# ifdef DEBUGLEVEL /* backwards compat */
+#   define XXH_DEBUGLEVEL DEBUGLEVEL
+# else
+#   define XXH_DEBUGLEVEL 0
+# endif
+#endif
+
+#if (XXH_DEBUGLEVEL>=1)
+# include <assert.h>   /* note: can still be disabled with NDEBUG */
+# define XXH_ASSERT(c)   assert(c)
+#else
+# if defined(__INTEL_COMPILER)
+#   define XXH_ASSERT(c)   XXH_ASSUME((unsigned char) (c))
+# else
+#   define XXH_ASSERT(c)   XXH_ASSUME(c)
+# endif
+#endif
+
+/* note: use after variable declarations */
+#ifndef XXH_STATIC_ASSERT
+# if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)    /* C11 */
+#   define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { _Static_assert((c),m); } while(0)
+# elif defined(__cplusplus) && (__cplusplus >= 201103L)            /* C++11 */
+#   define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { static_assert((c),m); } while(0)
+# else
+#   define XXH_STATIC_ASSERT_WITH_MESSAGE(c,m) do { struct xxh_sa { char x[(c) ? 1 : -1]; }; } while(0)
+# endif
+# define XXH_STATIC_ASSERT(c) XXH_STATIC_ASSERT_WITH_MESSAGE((c),#c)
+#endif
+
+/*!
+ * @internal
+ * @def XXH_COMPILER_GUARD(var)
+ * @brief Used to prevent unwanted optimizations for @p var.
+ *
+ * It uses an empty GCC inline assembly statement with a register constraint
+ * which forces @p var into a general purpose register (e.g. eax, ebx, ecx
+ * on x86) and marks it as modified.
+ *
+ * This is used in a few places to avoid unwanted autovectorization (e.g.
+ * XXH32_round()). All vectorization we want is explicit via intrinsics,
+ * and _usually_ isn't wanted elsewhere.
+ *
+ * We also use it to prevent unwanted constant folding for AArch64 in
+ * XXH3_initCustomSecret_scalar().
+ */
+#if defined(__GNUC__) || defined(__clang__)
+# define XXH_COMPILER_GUARD(var) __asm__("" : "+r" (var))
+#else
+# define XXH_COMPILER_GUARD(var) ((void)0)
+#endif
+
+/* Specifically for NEON vectors, which use the "w" constraint on Clang. */
+#if defined(__clang__) && defined(__ARM_ARCH) && !defined(__wasm__)
+# define XXH_COMPILER_GUARD_CLANG_NEON(var) __asm__("" : "+w" (var))
+#else
+# define XXH_COMPILER_GUARD_CLANG_NEON(var) ((void)0)
+#endif
+
+/* *************************************
+* Basic Types
+***************************************/
+#if !defined (__VMS) \
+ && (defined (__cplusplus) \
+ || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+# include <stdint.h>
+  typedef uint8_t xxh_u8;
+#else
+  typedef unsigned char xxh_u8;
+#endif
+typedef XXH32_hash_t xxh_u32;
+
+#ifdef XXH_OLD_NAMES
+# warning "XXH_OLD_NAMES is planned to be removed starting v0.9. If the program depends on it, consider moving away from it by employing newer type names directly"
+# define BYTE xxh_u8
+# define U8 xxh_u8
+# define U32 xxh_u32
+#endif
+
+/* *** Memory access *** */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_read32(const void* ptr)
+ * @brief Reads an unaligned 32-bit integer from @p ptr in native endianness.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit native endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit little endian integer from @p ptr.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readBE32(const void* ptr)
+ * @brief Reads an unaligned 32-bit big endian integer from @p ptr. 
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ *
+ * @param ptr The pointer to read from.
+ * @return The 32-bit big endian integer from the bytes at @p ptr.
+ */
+
+/*!
+ * @internal
+ * @fn xxh_u32 XXH_readLE32_align(const void* ptr, XXH_alignment align)
+ * @brief Like @ref XXH_readLE32(), but has an option for aligned reads.
+ *
+ * Affected by @ref XXH_FORCE_MEMORY_ACCESS.
+ * Note that when @ref XXH_FORCE_ALIGN_CHECK == 0, the @p align parameter is
+ * always @ref XXH_alignment::XXH_unaligned.
+ *
+ * @param ptr The pointer to read from.
+ * @param align Whether @p ptr is aligned.
+ * @pre
+ *   If @p align == @ref XXH_alignment::XXH_aligned, @p ptr must be 4-byte
+ *   aligned.
+ * @return The 32-bit little endian integer from the bytes at @p ptr.
+ */
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE32 and XXH_readBE32.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/*
+ * Force direct memory access. Only works on CPUs which support unaligned memory
+ * access in hardware.
+ */
+static xxh_u32 XXH_read32(const void* memPtr) { return *(const xxh_u32*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; } __attribute__((packed)) unalign;
+#endif
+static xxh_u32 XXH_read32(const void* ptr)
+{
+    typedef __attribute__((aligned(1))) xxh_u32 xxh_unalign32;
+    return *((const xxh_unalign32*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient.
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u32 XXH_read32(const void* memPtr)
+{
+    xxh_u32 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif /* XXH_FORCE_MEMORY_ACCESS */
+
+
+/* *** Endianness *** */
+
+/*!
+ * @ingroup tuning
+ * @def XXH_CPU_LITTLE_ENDIAN
+ * @brief Whether the target is little endian.
+ *
+ * Defined to 1 if the target is little endian, or 0 if it is big endian.
+ * It can be defined externally, for example on the compiler command line.
+ *
+ * If it is not defined,
+ * a runtime check (which is usually constant folded) is used instead.
+ *
+ * @note
+ *   This is not necessarily defined to an integer constant.
+ *
+ * @see XXH_isLittleEndian() for the runtime check.
+ */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+/*
+ * Try to detect endianness automatically, to avoid falling back to the
+ * runtime check in `XXH_isLittleEndian()`
+ */
+# if defined(_WIN32) /* Windows is always little endian */ \
+  || defined(__LITTLE_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#   define XXH_CPU_LITTLE_ENDIAN 1
+# elif defined(__BIG_ENDIAN__) \
+    || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#   define XXH_CPU_LITTLE_ENDIAN 0
+# else
+/*!
+ * @internal
+ * @brief Runtime check for @ref XXH_CPU_LITTLE_ENDIAN.
+ *
+ * Most compilers will constant fold this.
+ */
+static int XXH_isLittleEndian(void)
+{
+    /*
+     * Portable and well-defined behavior.
+     * Don't use static: it is detrimental to performance. 
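+     * (Writing one union member and reading another, i.e. type punning,
+     * is sanctioned by C: one.c[0] is the lowest-addressed byte of one.u,
+     * which equals 1 exactly on little endian targets.)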
+ */ + const union { xxh_u32 u; xxh_u8 c[4]; } one = { 1 }; + return one.c[0]; +} +# define XXH_CPU_LITTLE_ENDIAN XXH_isLittleEndian() +# endif +#endif + + + + +/* **************************************** +* Compiler-specific Functions and Macros +******************************************/ +#define XXH_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) + +#ifdef __has_builtin +# define XXH_HAS_BUILTIN(x) __has_builtin(x) +#else +# define XXH_HAS_BUILTIN(x) 0 +#endif + + + +/* + * C23 and future versions have standard "unreachable()". + * Once it has been implemented reliably we can add it as an + * additional case: + * + * ``` + * #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= XXH_C23_VN) + * # include <stddef.h> + * # ifdef unreachable + * # define XXH_UNREACHABLE() unreachable() + * # endif + * #endif + * ``` + * + * Note C++23 also has std::unreachable() which can be detected + * as follows: + * ``` + * #if defined(__cpp_lib_unreachable) && (__cpp_lib_unreachable >= 202202L) + * # include <utility> + * # define XXH_UNREACHABLE() std::unreachable() + * #endif + * ``` + * NB: `__cpp_lib_unreachable` is defined in the `<version>` header. + * We don't use that as including `<utility>` in `extern "C"` blocks + * doesn't work on GCC12 + */ + +#if XXH_HAS_BUILTIN(__builtin_unreachable) +# define XXH_UNREACHABLE() __builtin_unreachable() + +#elif defined(_MSC_VER) +# define XXH_UNREACHABLE() __assume(0) + +#else +# define XXH_UNREACHABLE() +#endif + +#if XXH_HAS_BUILTIN(__builtin_assume) +# define XXH_ASSUME(c) __builtin_assume(c) +#else +# define XXH_ASSUME(c) if (!(c)) { XXH_UNREACHABLE(); } +#endif + +/*! + * @internal + * @def XXH_rotl32(x,r) + * @brief 32-bit rotate left. + * + * @param x The 32-bit integer to be rotated. + * @param r The number of bits to rotate. + * @pre + * @p r > 0 && @p r < 32 + * @note + * @p x and @p r may be evaluated multiple times. + * @return The rotated result. + */ +#if !defined(NO_CLANG_BUILTIN) && XXH_HAS_BUILTIN(__builtin_rotateleft32) \ + && XXH_HAS_BUILTIN(__builtin_rotateleft64) +# define XXH_rotl32 __builtin_rotateleft32 +# define XXH_rotl64 __builtin_rotateleft64 +/* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */ +#elif defined(_MSC_VER) +# define XXH_rotl32(x,r) _rotl(x,r) +# define XXH_rotl64(x,r) _rotl64(x,r) +#else +# define XXH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) +# define XXH_rotl64(x,r) (((x) << (r)) | ((x) >> (64 - (r)))) +#endif + +/*! + * @internal + * @fn xxh_u32 XXH_swap32(xxh_u32 x) + * @brief A 32-bit byteswap. + * + * @param x The 32-bit integer to byteswap. + * @return @p x, byteswapped. + */ +#if defined(_MSC_VER) /* Visual Studio */ +# define XXH_swap32 _byteswap_ulong +#elif XXH_GCC_VERSION >= 403 +# define XXH_swap32 __builtin_bswap32 +#else +static xxh_u32 XXH_swap32 (xxh_u32 x) +{ + return ((x << 24) & 0xff000000 ) | + ((x << 8) & 0x00ff0000 ) | + ((x >> 8) & 0x0000ff00 ) | + ((x >> 24) & 0x000000ff ); +} +#endif + + +/* *************************** +* Memory reads +*****************************/ + +/*! + * @internal + * @brief Enum to indicate whether a pointer is aligned. + */ +typedef enum { + XXH_aligned, /*!< Aligned */ + XXH_unaligned /*!< Possibly unaligned */ +} XXH_alignment; + +/* + * XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. + * + * This is ideal for older compilers which don't inline memcpy. 
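+ *
+ * Each byte is loaded and shifted individually, so the loads below are
+ * alignment-safe and independent of the host's endianness.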
+ */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u32)bytePtr[1] << 8)
+         | ((xxh_u32)bytePtr[2] << 16)
+         | ((xxh_u32)bytePtr[3] << 24);
+}
+
+XXH_FORCE_INLINE xxh_u32 XXH_readBE32(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[3]
+         | ((xxh_u32)bytePtr[2] << 8)
+         | ((xxh_u32)bytePtr[1] << 16)
+         | ((xxh_u32)bytePtr[0] << 24);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u32 XXH_readLE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+}
+
+static xxh_u32 XXH_readBE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u32
+XXH_readLE32_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned) {
+        return XXH_readLE32(ptr);
+    } else {
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u32*)ptr : XXH_swap32(*(const xxh_u32*)ptr);
+    }
+}
+
+
+/* *************************************
+* Misc
+***************************************/
+/*! @ingroup public */
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* *******************************************************************
+* 32-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @defgroup XXH32_impl XXH32 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH32 implementation.
+ * @{
+ */
+ /* #define instead of static const, to be used as initializers */
+#define XXH_PRIME32_1  0x9E3779B1U  /*!< 0b10011110001101110111100110110001 */
+#define XXH_PRIME32_2  0x85EBCA77U  /*!< 0b10000101111010111100101001110111 */
+#define XXH_PRIME32_3  0xC2B2AE3DU  /*!< 0b11000010101100101010111000111101 */
+#define XXH_PRIME32_4  0x27D4EB2FU  /*!< 0b00100111110101001110101100101111 */
+#define XXH_PRIME32_5  0x165667B1U  /*!< 0b00010110010101100110011110110001 */
+
+#ifdef XXH_OLD_NAMES
+# define PRIME32_1 XXH_PRIME32_1
+# define PRIME32_2 XXH_PRIME32_2
+# define PRIME32_3 XXH_PRIME32_3
+# define PRIME32_4 XXH_PRIME32_4
+# define PRIME32_5 XXH_PRIME32_5
+#endif
+
+/*!
+ * @internal
+ * @brief Normal stripe processing routine.
+ *
+ * This shuffles the bits so that any bit from @p input impacts several bits in
+ * @p acc.
+ *
+ * @param acc The accumulator lane.
+ * @param input The stripe of input to mix.
+ * @return The mixed accumulator lane.
+ */
+static xxh_u32 XXH32_round(xxh_u32 acc, xxh_u32 input)
+{
+    acc += input * XXH_PRIME32_2;
+    acc  = XXH_rotl32(acc, 13);
+    acc *= XXH_PRIME32_1;
+#if (defined(__SSE4_1__) || defined(__aarch64__) || defined(__wasm_simd128__)) && !defined(XXH_ENABLE_AUTOVECTORIZE)
+    /*
+     * UGLY HACK:
+     * A compiler fence is the only thing that prevents GCC and Clang from
+     * autovectorizing the XXH32 loop (pragmas and attributes don't work for some
+     * reason) without globally disabling SSE4.1.
+     *
+     * The reason we want to avoid vectorization is because despite working on
+     * 4 integers at a time, there are multiple factors slowing XXH32 down on
+     * SSE4:
+     * - There's a ridiculous amount of lag from pmulld (10 cycles of latency on
+     *   newer chips!) making it slightly slower to multiply four integers at
+     *   once compared to four integers independently. Even on Sandy/Ivy Bridge,
+     *   where pmulld was fastest, it is still not worth it to go into SSE
+     *   just to multiply unless doing a long operation. 
+     *
+     * - Four instructions are required to rotate,
+     *      movdqa tmp,  v // not required with VEX encoding
+     *      pslld  tmp, 13 // tmp <<= 13
+     *      psrld  v,   19 // v >>= 19
+     *      por    v,  tmp // v |= tmp
+     *   compared to one for scalar:
+     *      roll   v, 13    // reliably fast across the board
+     *      shldl  v, v, 13 // Sandy Bridge and later prefer this for some reason
+     *
+     * - Instruction level parallelism is actually more beneficial here because
+     *   the SIMD actually serializes this operation: While v1 is rotating, v2
+     *   can load data, while v3 can multiply. SSE forces them to operate
+     *   together.
+     *
+     * This is also enabled on AArch64, as Clang is *very aggressive* in vectorizing
+     * the loop. NEON is only faster on the A53, and on the newer cores, it is less
+     * than half the speed.
+     *
+     * Additionally, this is used on WASM SIMD128 because it JITs to the same
+     * SIMD instructions and has the same issue.
+     */
+    XXH_COMPILER_GUARD(acc);
+#endif
+    return acc;
+}
+
+/*!
+ * @internal
+ * @brief Mixes all bits to finalize the hash.
+ *
+ * The final mix ensures that all input bits have a chance to impact any bit in
+ * the output digest, resulting in an unbiased distribution.
+ *
+ * @param hash The hash to avalanche.
+ * @return The avalanched hash.
+ */
+static xxh_u32 XXH32_avalanche(xxh_u32 hash)
+{
+    hash ^= hash >> 15;
+    hash *= XXH_PRIME32_2;
+    hash ^= hash >> 13;
+    hash *= XXH_PRIME32_3;
+    hash ^= hash >> 16;
+    return hash;
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, align)
+
+/*!
+ * @internal
+ * @brief Processes the last 0-15 bytes of @p ptr.
+ *
+ * There may be up to 15 bytes remaining to consume from the input.
+ * This final stage will digest them to ensure that all input bytes are present
+ * in the final mix.
+ *
+ * @param hash The hash to finalize.
+ * @param ptr The pointer to the remaining input.
+ * @param len The remaining length, modulo 16.
+ * @param align Whether @p ptr is aligned.
+ * @return The finalized hash.
+ * @see XXH64_finalize(). 
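+ *
+ * @note For example, with len == 7, the compact rerolled path below consumes
+ * one 4-byte word via XXH_PROCESS4, then three single bytes via XXH_PROCESS1,
+ * before avalanching.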
+ */ +static XXH_PUREF xxh_u32 +XXH32_finalize(xxh_u32 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ +#define XXH_PROCESS1 do { \ + hash += (*ptr++) * XXH_PRIME32_5; \ + hash = XXH_rotl32(hash, 11) * XXH_PRIME32_1; \ +} while (0) + +#define XXH_PROCESS4 do { \ + hash += XXH_get32bits(ptr) * XXH_PRIME32_3; \ + ptr += 4; \ + hash = XXH_rotl32(hash, 17) * XXH_PRIME32_4; \ +} while (0) + + if (ptr==NULL) XXH_ASSERT(len == 0); + + /* Compact rerolled version; generally faster */ + if (!XXH32_ENDJMP) { + len &= 15; + while (len >= 4) { + XXH_PROCESS4; + len -= 4; + } + while (len > 0) { + XXH_PROCESS1; + --len; + } + return XXH32_avalanche(hash); + } else { + switch(len&15) /* or switch(bEnd - p) */ { + case 12: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 8: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 4: XXH_PROCESS4; + return XXH32_avalanche(hash); + + case 13: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 9: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 5: XXH_PROCESS4; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 14: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 10: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 6: XXH_PROCESS4; + XXH_PROCESS1; + XXH_PROCESS1; + return XXH32_avalanche(hash); + + case 15: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 11: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 7: XXH_PROCESS4; + XXH_FALLTHROUGH; /* fallthrough */ + case 3: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 2: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 1: XXH_PROCESS1; + XXH_FALLTHROUGH; /* fallthrough */ + case 0: return XXH32_avalanche(hash); + } + XXH_ASSERT(0); + return hash; /* reaching this point is deemed impossible */ + } +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1 XXH_PROCESS1 +# define PROCESS4 XXH_PROCESS4 +#else +# undef XXH_PROCESS1 +# undef XXH_PROCESS4 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH32(). + * + * @param input , len , seed Directly passed from @ref XXH32(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u32 +XXH32_endian_align(const xxh_u8* input, size_t len, xxh_u32 seed, XXH_alignment align) +{ + xxh_u32 h32; + + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=16) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 15; + xxh_u32 v1 = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + xxh_u32 v2 = seed + XXH_PRIME32_2; + xxh_u32 v3 = seed + 0; + xxh_u32 v4 = seed - XXH_PRIME32_1; + + do { + v1 = XXH32_round(v1, XXH_get32bits(input)); input += 4; + v2 = XXH32_round(v2, XXH_get32bits(input)); input += 4; + v3 = XXH32_round(v3, XXH_get32bits(input)); input += 4; + v4 = XXH32_round(v4, XXH_get32bits(input)); input += 4; + } while (input < limit); + + h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18); + } else { + h32 = seed + XXH_PRIME32_5; + } + + h32 += (xxh_u32)len; + + return XXH32_finalize(h32, input, len&15, align); +} + +/*! 
@ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t len, XXH32_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH32_state_t state; + XXH32_reset(&state, seed); + XXH32_update(&state, (const xxh_u8*)input, len); + return XXH32_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 3) == 0) { /* Input is 4-bytes aligned, leverage the speed benefit */ + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH32_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); +#endif +} + + + +/******* Hash streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void) +{ + return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t)); +} +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, XXH32_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME32_1 + XXH_PRIME32_2; + statePtr->v[1] = seed + XXH_PRIME32_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME32_1; + return XXH_OK; +} + + +/*! @ingroup XXH32_family */ +XXH_PUBLIC_API XXH_errorcode +XXH32_update(XXH32_state_t* state, const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len_32 += (XXH32_hash_t)len; + state->large_len |= (XXH32_hash_t)((len>=16) | (state->total_len_32>=16)); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, len); + state->memsize += (XXH32_hash_t)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((xxh_u8*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const xxh_u32* p32 = state->mem32; + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p32)); p32++; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p32)); p32++; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p32)); p32++; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p32)); + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const xxh_u8* const limit = bEnd - 16; + + do { + state->v[0] = XXH32_round(state->v[0], XXH_readLE32(p)); p+=4; + state->v[1] = XXH32_round(state->v[1], XXH_readLE32(p)); p+=4; + state->v[2] = XXH32_round(state->v[2], XXH_readLE32(p)); p+=4; + state->v[3] = XXH32_round(state->v[3], XXH_readLE32(p)); p+=4; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! 
@ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_digest(const XXH32_state_t* state)
+{
+    xxh_u32 h32;
+
+    if (state->large_len) {
+        h32 = XXH_rotl32(state->v[0], 1)
+            + XXH_rotl32(state->v[1], 7)
+            + XXH_rotl32(state->v[2], 12)
+            + XXH_rotl32(state->v[3], 18);
+    } else {
+        h32 = state->v[2] /* == seed */ + XXH_PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    return XXH32_finalize(h32, (const xxh_u8*)state->mem32, state->memsize, XXH_aligned);
+}
+#endif /* !XXH_NO_STREAM */
+
+/******* Canonical representation *******/
+
+/*!
+ * @ingroup XXH32_family
+ * The default return values from XXH functions are unsigned 32- and 64-bit
+ * integers.
+ *
+ * The canonical representation uses the big endian convention, the same
+ * convention as human-readable numbers (most significant digits first).
+ *
+ * This way, hash values can be written into a file or buffer, remaining
+ * comparable across different systems.
+ *
+ * The following functions allow transformation of hash values to and from their
+ * canonical format.
+ */
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    XXH_memcpy(dst, &hash, sizeof(*dst));
+}
+/*! @ingroup XXH32_family */
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+
+#ifndef XXH_NO_LONG_LONG
+
+/* *******************************************************************
+* 64-bit hash functions
+*********************************************************************/
+/*!
+ * @}
+ * @ingroup impl
+ * @{
+ */
+/******* Memory access *******/
+
+typedef XXH64_hash_t xxh_u64;
+
+#ifdef XXH_OLD_NAMES
+# define U64 xxh_u64
+#endif
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+/*
+ * Manual byteshift. Best for old compilers which don't inline memcpy.
+ * We actually directly use XXH_readLE64 and XXH_readBE64.
+ */
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPUs which support unaligned memory access in hardware */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    return *(const xxh_u64*) memPtr;
+}
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/*
+ * __attribute__((aligned(1))) is supported by gcc and clang. Originally the
+ * documentation claimed that it only increased the alignment, but actually it
+ * can decrease it on gcc, clang, and icc:
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=69502,
+ * https://gcc.godbolt.org/z/xYez1j67Y.
+ */
+#ifdef XXH_OLD_NAMES
+typedef union { xxh_u32 u32; xxh_u64 u64; } __attribute__((packed)) unalign64;
+#endif
+static xxh_u64 XXH_read64(const void* ptr)
+{
+    typedef __attribute__((aligned(1))) xxh_u64 xxh_unalign64;
+    return *((const xxh_unalign64*)ptr);
+}
+
+#else
+
+/*
+ * Portable and safe solution. Generally efficient. 
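+ * Modern compilers typically lower this memcpy() to a single unaligned load.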
+ * see: https://fastcompression.blogspot.com/2015/08/accessing-unaligned-memory.html
+ */
+static xxh_u64 XXH_read64(const void* memPtr)
+{
+    xxh_u64 val;
+    XXH_memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif /* XXH_FORCE_MEMORY_ACCESS */
+
+#if defined(_MSC_VER)     /* Visual Studio */
+# define XXH_swap64 _byteswap_uint64
+#elif XXH_GCC_VERSION >= 403
+# define XXH_swap64 __builtin_bswap64
+#else
+static xxh_u64 XXH_swap64(xxh_u64 x)
+{
+    return ((x << 56) & 0xff00000000000000ULL) |
+           ((x << 40) & 0x00ff000000000000ULL) |
+           ((x << 24) & 0x0000ff0000000000ULL) |
+           ((x << 8)  & 0x000000ff00000000ULL) |
+           ((x >> 8)  & 0x00000000ff000000ULL) |
+           ((x >> 24) & 0x0000000000ff0000ULL) |
+           ((x >> 40) & 0x000000000000ff00ULL) |
+           ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* XXH_FORCE_MEMORY_ACCESS==3 is an endian-independent byteshift load. */
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==3))
+
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[0]
+         | ((xxh_u64)bytePtr[1] << 8)
+         | ((xxh_u64)bytePtr[2] << 16)
+         | ((xxh_u64)bytePtr[3] << 24)
+         | ((xxh_u64)bytePtr[4] << 32)
+         | ((xxh_u64)bytePtr[5] << 40)
+         | ((xxh_u64)bytePtr[6] << 48)
+         | ((xxh_u64)bytePtr[7] << 56);
+}
+
+XXH_FORCE_INLINE xxh_u64 XXH_readBE64(const void* memPtr)
+{
+    const xxh_u8* bytePtr = (const xxh_u8 *)memPtr;
+    return bytePtr[7]
+         | ((xxh_u64)bytePtr[6] << 8)
+         | ((xxh_u64)bytePtr[5] << 16)
+         | ((xxh_u64)bytePtr[4] << 24)
+         | ((xxh_u64)bytePtr[3] << 32)
+         | ((xxh_u64)bytePtr[2] << 40)
+         | ((xxh_u64)bytePtr[1] << 48)
+         | ((xxh_u64)bytePtr[0] << 56);
+}
+
+#else
+XXH_FORCE_INLINE xxh_u64 XXH_readLE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+}
+
+static xxh_u64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+#endif
+
+XXH_FORCE_INLINE xxh_u64
+XXH_readLE64_align(const void* ptr, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return XXH_readLE64(ptr);
+    else
+        return XXH_CPU_LITTLE_ENDIAN ? *(const xxh_u64*)ptr : XXH_swap64(*(const xxh_u64*)ptr);
+}
+
+
+/******* xxh64 *******/
+/*!
+ * @}
+ * @defgroup XXH64_impl XXH64 implementation
+ * @ingroup impl
+ *
+ * Details on the XXH64 implementation.
+ * @{
+ */
+/* #define rather than static const, to be used as initializers */
+#define XXH_PRIME64_1  0x9E3779B185EBCA87ULL  /*!< 0b1001111000110111011110011011000110000101111010111100101010000111 */
+#define XXH_PRIME64_2  0xC2B2AE3D27D4EB4FULL  /*!< 0b1100001010110010101011100011110100100111110101001110101101001111 */
+#define XXH_PRIME64_3  0x165667B19E3779F9ULL  /*!< 0b0001011001010110011001111011000110011110001101110111100111111001 */
+#define XXH_PRIME64_4  0x85EBCA77C2B2AE63ULL  /*!< 0b1000010111101011110010100111011111000010101100101010111001100011 */
+#define XXH_PRIME64_5  0x27D4EB2F165667C5ULL  /*!< 0b0010011111010100111010110010111100010110010101100110011111000101 */
+
+#ifdef XXH_OLD_NAMES
+# define PRIME64_1 XXH_PRIME64_1
+# define PRIME64_2 XXH_PRIME64_2
+# define PRIME64_3 XXH_PRIME64_3
+# define PRIME64_4 XXH_PRIME64_4
+# define PRIME64_5 XXH_PRIME64_5
+#endif
+
+/*! 
@copydoc XXH32_round */ +static xxh_u64 XXH64_round(xxh_u64 acc, xxh_u64 input) +{ + acc += input * XXH_PRIME64_2; + acc = XXH_rotl64(acc, 31); + acc *= XXH_PRIME64_1; + return acc; +} + +static xxh_u64 XXH64_mergeRound(xxh_u64 acc, xxh_u64 val) +{ + val = XXH64_round(0, val); + acc ^= val; + acc = acc * XXH_PRIME64_1 + XXH_PRIME64_4; + return acc; +} + +/*! @copydoc XXH32_avalanche */ +static xxh_u64 XXH64_avalanche(xxh_u64 hash) +{ + hash ^= hash >> 33; + hash *= XXH_PRIME64_2; + hash ^= hash >> 29; + hash *= XXH_PRIME64_3; + hash ^= hash >> 32; + return hash; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, align) + +/*! + * @internal + * @brief Processes the last 0-31 bytes of @p ptr. + * + * There may be up to 31 bytes remaining to consume from the input. + * This final stage will digest them to ensure that all input bytes are present + * in the final mix. + * + * @param hash The hash to finalize. + * @param ptr The pointer to the remaining input. + * @param len The remaining length, modulo 32. + * @param align Whether @p ptr is aligned. + * @return The finalized hash + * @see XXH32_finalize(). + */ +static XXH_PUREF xxh_u64 +XXH64_finalize(xxh_u64 hash, const xxh_u8* ptr, size_t len, XXH_alignment align) +{ + if (ptr==NULL) XXH_ASSERT(len == 0); + len &= 31; + while (len >= 8) { + xxh_u64 const k1 = XXH64_round(0, XXH_get64bits(ptr)); + ptr += 8; + hash ^= k1; + hash = XXH_rotl64(hash,27) * XXH_PRIME64_1 + XXH_PRIME64_4; + len -= 8; + } + if (len >= 4) { + hash ^= (xxh_u64)(XXH_get32bits(ptr)) * XXH_PRIME64_1; + ptr += 4; + hash = XXH_rotl64(hash, 23) * XXH_PRIME64_2 + XXH_PRIME64_3; + len -= 4; + } + while (len > 0) { + hash ^= (*ptr++) * XXH_PRIME64_5; + hash = XXH_rotl64(hash, 11) * XXH_PRIME64_1; + --len; + } + return XXH64_avalanche(hash); +} + +#ifdef XXH_OLD_NAMES +# define PROCESS1_64 XXH_PROCESS1_64 +# define PROCESS4_64 XXH_PROCESS4_64 +# define PROCESS8_64 XXH_PROCESS8_64 +#else +# undef XXH_PROCESS1_64 +# undef XXH_PROCESS4_64 +# undef XXH_PROCESS8_64 +#endif + +/*! + * @internal + * @brief The implementation for @ref XXH64(). + * + * @param input , len , seed Directly passed from @ref XXH64(). + * @param align Whether @p input is aligned. + * @return The calculated hash. + */ +XXH_FORCE_INLINE XXH_PUREF xxh_u64 +XXH64_endian_align(const xxh_u8* input, size_t len, xxh_u64 seed, XXH_alignment align) +{ + xxh_u64 h64; + if (input==NULL) XXH_ASSERT(len == 0); + + if (len>=32) { + const xxh_u8* const bEnd = input + len; + const xxh_u8* const limit = bEnd - 31; + xxh_u64 v1 = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + xxh_u64 v2 = seed + XXH_PRIME64_2; + xxh_u64 v3 = seed + 0; + xxh_u64 v4 = seed - XXH_PRIME64_1; + + do { + v1 = XXH64_round(v1, XXH_get64bits(input)); input+=8; + v2 = XXH64_round(v2, XXH_get64bits(input)); input+=8; + v3 = XXH64_round(v3, XXH_get64bits(input)); input+=8; + v4 = XXH64_round(v4, XXH_get64bits(input)); input+=8; + } while (input<limit); + + h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18); + h64 = XXH64_mergeRound(h64, v1); + h64 = XXH64_mergeRound(h64, v2); + h64 = XXH64_mergeRound(h64, v3); + h64 = XXH64_mergeRound(h64, v4); + + } else { + h64 = seed + XXH_PRIME64_5; + } + + h64 += (xxh_u64) len; + + return XXH64_finalize(h64, input, len, align); +} + + +/*! 
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64 (XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed) +{ +#if !defined(XXH_NO_STREAM) && XXH_SIZE_OPT >= 2 + /* Simple version, good for code maintenance, but unfortunately slow for small inputs */ + XXH64_state_t state; + XXH64_reset(&state, seed); + XXH64_update(&state, (const xxh_u8*)input, len); + return XXH64_digest(&state); +#else + if (XXH_FORCE_ALIGN_CHECK) { + if ((((size_t)input) & 7)==0) { /* Input is aligned, let's leverage the speed advantage */ + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_aligned); + } } + + return XXH64_endian_align((const xxh_u8*)input, len, seed, XXH_unaligned); + +#endif +} + +/******* Hash Streaming *******/ +#ifndef XXH_NO_STREAM +/*! @ingroup XXH64_family*/ +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void) +{ + return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t)); +} +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_copyState(XXH_NOESCAPE XXH64_state_t* dstState, const XXH64_state_t* srcState) +{ + XXH_memcpy(dstState, srcState, sizeof(*dstState)); +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH_NOESCAPE XXH64_state_t* statePtr, XXH64_hash_t seed) +{ + XXH_ASSERT(statePtr != NULL); + memset(statePtr, 0, sizeof(*statePtr)); + statePtr->v[0] = seed + XXH_PRIME64_1 + XXH_PRIME64_2; + statePtr->v[1] = seed + XXH_PRIME64_2; + statePtr->v[2] = seed + 0; + statePtr->v[3] = seed - XXH_PRIME64_1; + return XXH_OK; +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH_errorcode +XXH64_update (XXH_NOESCAPE XXH64_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + if (input==NULL) { + XXH_ASSERT(len == 0); + return XXH_OK; + } + + { const xxh_u8* p = (const xxh_u8*)input; + const xxh_u8* const bEnd = p + len; + + state->total_len += len; + + if (state->memsize + len < 32) { /* fill in tmp buffer */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, len); + state->memsize += (xxh_u32)len; + return XXH_OK; + } + + if (state->memsize) { /* tmp buffer is full */ + XXH_memcpy(((xxh_u8*)state->mem64) + state->memsize, input, 32-state->memsize); + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(state->mem64+0)); + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(state->mem64+1)); + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(state->mem64+2)); + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(state->mem64+3)); + p += 32 - state->memsize; + state->memsize = 0; + } + + if (p+32 <= bEnd) { + const xxh_u8* const limit = bEnd - 32; + + do { + state->v[0] = XXH64_round(state->v[0], XXH_readLE64(p)); p+=8; + state->v[1] = XXH64_round(state->v[1], XXH_readLE64(p)); p+=8; + state->v[2] = XXH64_round(state->v[2], XXH_readLE64(p)); p+=8; + state->v[3] = XXH64_round(state->v[3], XXH_readLE64(p)); p+=8; + } while (p<=limit); + + } + + if (p < bEnd) { + XXH_memcpy(state->mem64, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + } + + return XXH_OK; +} + + +/*! 
@ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_digest(XXH_NOESCAPE const XXH64_state_t* state) +{ + xxh_u64 h64; + + if (state->total_len >= 32) { + h64 = XXH_rotl64(state->v[0], 1) + XXH_rotl64(state->v[1], 7) + XXH_rotl64(state->v[2], 12) + XXH_rotl64(state->v[3], 18); + h64 = XXH64_mergeRound(h64, state->v[0]); + h64 = XXH64_mergeRound(h64, state->v[1]); + h64 = XXH64_mergeRound(h64, state->v[2]); + h64 = XXH64_mergeRound(h64, state->v[3]); + } else { + h64 = state->v[2] /*seed*/ + XXH_PRIME64_5; + } + + h64 += (xxh_u64) state->total_len; + + return XXH64_finalize(h64, (const xxh_u8*)state->mem64, (size_t)state->total_len, XXH_aligned); +} +#endif /* !XXH_NO_STREAM */ + +/******* Canonical representation *******/ + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH_NOESCAPE XXH64_canonical_t* dst, XXH64_hash_t hash) +{ + XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t)); + if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash); + XXH_memcpy(dst, &hash, sizeof(*dst)); +} + +/*! @ingroup XXH64_family */ +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(XXH_NOESCAPE const XXH64_canonical_t* src) +{ + return XXH_readBE64(src); +} + +#ifndef XXH_NO_XXH3 + +/* ********************************************************************* +* XXH3 +* New generation hash designed for speed on small keys and vectorization +************************************************************************ */ +/*! + * @} + * @defgroup XXH3_impl XXH3 implementation + * @ingroup impl + * @{ + */ + +/* === Compiler specifics === */ + +#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */ +# define XXH_RESTRICT /* disable */ +#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* >= C99 */ +# define XXH_RESTRICT restrict +#elif (defined (__GNUC__) && ((__GNUC__ > 3) || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))) \ + || (defined (__clang__)) \ + || (defined (_MSC_VER) && (_MSC_VER >= 1400)) \ + || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1300)) +/* + * There are a LOT more compilers that recognize __restrict but this + * covers the major ones. + */ +# define XXH_RESTRICT __restrict +#else +# define XXH_RESTRICT /* disable */ +#endif + +#if (defined(__GNUC__) && (__GNUC__ >= 3)) \ + || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \ + || defined(__clang__) +# define XXH_likely(x) __builtin_expect(x, 1) +# define XXH_unlikely(x) __builtin_expect(x, 0) +#else +# define XXH_likely(x) (x) +# define XXH_unlikely(x) (x) +#endif + +#ifndef XXH_HAS_INCLUDE +# ifdef __has_include +# define XXH_HAS_INCLUDE(x) __has_include(x) +# else +# define XXH_HAS_INCLUDE(x) 0 +# endif +#endif + +#if defined(__GNUC__) || defined(__clang__) +# if defined(__ARM_FEATURE_SVE) +# include <arm_sve.h> +# endif +# if defined(__ARM_NEON__) || defined(__ARM_NEON) \ + || (defined(_M_ARM) && _M_ARM >= 7) \ + || defined(_M_ARM64) || defined(_M_ARM64EC) \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* WASM SIMD128 via SIMDe */ +# define inline __inline__ /* circumvent a clang bug */ +# include <arm_neon.h> +# undef inline +# elif defined(__AVX2__) +# include <immintrin.h> +# elif defined(__SSE2__) +# include <emmintrin.h> +# endif +#endif + +#if defined(_MSC_VER) +# include <intrin.h> +#endif + +/* + * One goal of XXH3 is to make it fast on both 32-bit and 64-bit, while + * remaining a true 64-bit/128-bit hash function. 
+ * + * This is done by prioritizing a subset of 64-bit operations that can be + * emulated without too many steps on the average 32-bit machine. + * + * For example, these two lines seem similar, and run equally fast on 64-bit: + * + * xxh_u64 x; + * x ^= (x >> 47); // good + * x ^= (x >> 13); // bad + * + * However, to a 32-bit machine, there is a major difference. + * + * x ^= (x >> 47) looks like this: + * + * x.lo ^= (x.hi >> (47 - 32)); + * + * while x ^= (x >> 13) looks like this: + * + * // note: funnel shifts are not usually cheap. + * x.lo ^= (x.lo >> 13) | (x.hi << (32 - 13)); + * x.hi ^= (x.hi >> 13); + * + * The first one is significantly faster than the second, simply because the + * shift is larger than 32. This means: + * - All the bits we need are in the upper 32 bits, so we can ignore the lower + * 32 bits in the shift. + * - The shift result will always fit in the lower 32 bits, and therefore, + * we can ignore the upper 32 bits in the xor. + * + * Thanks to this optimization, XXH3 only requires these features to be efficient: + * + * - Usable unaligned access + * - A 32-bit or 64-bit ALU + * - If 32-bit, a decent ADC instruction + * - A 32 or 64-bit multiply with a 64-bit result + * - For the 128-bit variant, a decent byteswap helps short inputs. + * + * The first two are already required by XXH32, and almost all 32-bit and 64-bit + * platforms which can run XXH32 can run XXH3 efficiently. + * + * Thumb-1, the classic 16-bit only subset of ARM's instruction set, is one + * notable exception. + * + * First of all, Thumb-1 lacks support for the UMULL instruction which + * performs the important long multiply. This means numerous __aeabi_lmul + * calls. + * + * Second of all, the 8 functional registers are just not enough. + * Setup for __aeabi_lmul, byteshift loads, pointers, and all arithmetic need + * Lo registers, and this shuffling results in thousands more MOVs than A32. + * + * A32 and T32 don't have this limitation. They can access all 14 registers, + * do a 32->64 multiply with UMULL, and the flexible operand allowing free + * shifts is helpful, too. + * + * Therefore, we do a quick sanity check. + * + * If compiling Thumb-1 for a target which supports ARM instructions, we will + * emit a warning, as it is not a "sane" platform to compile for. + * + * Usually, if this happens, it is because of an accident and you probably need + * to specify -march, as you likely meant to compile for a newer architecture. + * + * Credit: large sections of the vectorial and asm source code paths + * have been contributed by @easyaspi314 + */ +#if defined(__thumb__) && !defined(__thumb2__) && defined(__ARM_ARCH_ISA_ARM) +# warning "XXH3 is highly inefficient without ARM or Thumb-2." +#endif + +/* ========================================== + * Vectorization detection + * ========================================== */ + +#ifdef XXH_DOXYGEN +/*! + * @ingroup tuning + * @brief Overrides the vectorization implementation chosen for XXH3. + * + * Can be defined to 0 to disable SIMD or any of the values mentioned in + * @ref XXH_VECTOR_TYPE. + * + * If this is not defined, it uses predefined macros to determine the best + * implementation. + */ +# define XXH_VECTOR XXH_SCALAR +/*! + * @ingroup tuning + * @brief Possible values for @ref XXH_VECTOR. + * + * Note that these are actually implemented as macros. + * + * If this is not defined, it is detected automatically. + * internal macro XXH_X86DISPATCH overrides this. 
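+ *
+ * Like @ref XXH_VECTOR, a value can be set on the compiler command line, e.g.
+ * `-DXXH_VECTOR=XXH_SCALAR` to force the portable scalar code path.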
+ */ +enum XXH_VECTOR_TYPE /* fake enum */ { + XXH_SCALAR = 0, /*!< Portable scalar version */ + XXH_SSE2 = 1, /*!< + * SSE2 for Pentium 4, Opteron, all x86_64. + * + * @note SSE2 is also guaranteed on Windows 10, macOS, and + * Android x86. + */ + XXH_AVX2 = 2, /*!< AVX2 for Haswell and Bulldozer */ + XXH_AVX512 = 3, /*!< AVX512 for Skylake and Icelake */ + XXH_NEON = 4, /*!< + * NEON for most ARMv7-A, all AArch64, and WASM SIMD128 + * via the SIMDeverywhere polyfill provided with the + * Emscripten SDK. + */ + XXH_VSX = 5, /*!< VSX and ZVector for POWER8/z13 (64-bit) */ + XXH_SVE = 6, /*!< SVE for some ARMv8-A and ARMv9-A */ +}; +/*! + * @ingroup tuning + * @brief Selects the minimum alignment for XXH3's accumulators. + * + * When using SIMD, this should match the alignment required for said vector + * type, so, for example, 32 for AVX2. + * + * Default: Auto detected. + */ +# define XXH_ACC_ALIGN 8 +#endif + +/* Actual definition */ +#ifndef XXH_DOXYGEN +# define XXH_SCALAR 0 +# define XXH_SSE2 1 +# define XXH_AVX2 2 +# define XXH_AVX512 3 +# define XXH_NEON 4 +# define XXH_VSX 5 +# define XXH_SVE 6 +#endif + +#ifndef XXH_VECTOR /* can be defined on command line */ +# if defined(__ARM_FEATURE_SVE) +# define XXH_VECTOR XXH_SVE +# elif ( \ + defined(__ARM_NEON__) || defined(__ARM_NEON) /* gcc */ \ + || defined(_M_ARM) || defined(_M_ARM64) || defined(_M_ARM64EC) /* msvc */ \ + || (defined(__wasm_simd128__) && XXH_HAS_INCLUDE(<arm_neon.h>)) /* wasm simd128 via SIMDe */ \ + ) && ( \ + defined(_WIN32) || defined(__LITTLE_ENDIAN__) /* little endian only */ \ + || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ + ) +# define XXH_VECTOR XXH_NEON +# elif defined(__AVX512F__) +# define XXH_VECTOR XXH_AVX512 +# elif defined(__AVX2__) +# define XXH_VECTOR XXH_AVX2 +# elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2)) +# define XXH_VECTOR XXH_SSE2 +# elif (defined(__PPC64__) && defined(__POWER8_VECTOR__)) \ + || (defined(__s390x__) && defined(__VEC__)) \ + && defined(__GNUC__) /* TODO: IBM XL */ +# define XXH_VECTOR XXH_VSX +# else +# define XXH_VECTOR XXH_SCALAR +# endif +#endif + +/* __ARM_FEATURE_SVE is only supported by GCC & Clang. */ +#if (XXH_VECTOR == XXH_SVE) && !defined(__ARM_FEATURE_SVE) +# ifdef _MSC_VER +# pragma warning(once : 4606) +# else +# warning "__ARM_FEATURE_SVE isn't supported. Use SCALAR instead." +# endif +# undef XXH_VECTOR +# define XXH_VECTOR XXH_SCALAR +#endif + +/* + * Controls the alignment of the accumulator, + * for compatibility with aligned vector loads, which are usually faster. 
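+ * For example, AVX2 uses 32-byte alignment, matching its aligned 256-bit loads.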
+ */ +#ifndef XXH_ACC_ALIGN +# if defined(XXH_X86DISPATCH) +# define XXH_ACC_ALIGN 64 /* for compatibility with avx512 */ +# elif XXH_VECTOR == XXH_SCALAR /* scalar */ +# define XXH_ACC_ALIGN 8 +# elif XXH_VECTOR == XXH_SSE2 /* sse2 */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX2 /* avx2 */ +# define XXH_ACC_ALIGN 32 +# elif XXH_VECTOR == XXH_NEON /* neon */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_VSX /* vsx */ +# define XXH_ACC_ALIGN 16 +# elif XXH_VECTOR == XXH_AVX512 /* avx512 */ +# define XXH_ACC_ALIGN 64 +# elif XXH_VECTOR == XXH_SVE /* sve */ +# define XXH_ACC_ALIGN 64 +# endif +#endif + +#if defined(XXH_X86DISPATCH) || XXH_VECTOR == XXH_SSE2 \ + || XXH_VECTOR == XXH_AVX2 || XXH_VECTOR == XXH_AVX512 +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#elif XXH_VECTOR == XXH_SVE +# define XXH_SEC_ALIGN XXH_ACC_ALIGN +#else +# define XXH_SEC_ALIGN 8 +#endif + +#if defined(__GNUC__) || defined(__clang__) +# define XXH_ALIASING __attribute__((may_alias)) +#else +# define XXH_ALIASING /* nothing */ +#endif + +/* + * UGLY HACK: + * GCC usually generates the best code with -O3 for xxHash. + * + * However, when targeting AVX2, it is overzealous in its unrolling resulting + * in code roughly 3/4 the speed of Clang. + * + * There are other issues, such as GCC splitting _mm256_loadu_si256 into + * _mm_loadu_si128 + _mm256_inserti128_si256. This is an optimization which + * only applies to Sandy and Ivy Bridge... which don't even support AVX2. + * + * That is why when compiling the AVX2 version, it is recommended to use either + * -O2 -mavx2 -march=haswell + * or + * -O2 -mavx2 -mno-avx256-split-unaligned-load + * for decent performance, or to use Clang instead. + * + * Fortunately, we can control the first one with a pragma that forces GCC into + * -O2, but the other one we can't control without "failed to inline always + * inline function due to target mismatch" warnings. + */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ +# pragma GCC push_options +# pragma GCC optimize("-O2") +#endif + +#if XXH_VECTOR == XXH_NEON + +/* + * UGLY HACK: While AArch64 GCC on Linux does not seem to care, on macOS, GCC -O3 + * optimizes out the entire hashLong loop because of the aliasing violation. + * + * However, GCC is also inefficient at load-store optimization with vld1q/vst1q, + * so the only option is to mark it as aliasing. + */ +typedef uint64x2_t xxh_aliasing_uint64x2_t XXH_ALIASING; + +/*! + * @internal + * @brief `vld1q_u64` but faster and alignment-safe. + * + * On AArch64, unaligned access is always safe, but on ARMv7-a, it is only + * *conditionally* safe (`vld1` has an alignment bit like `movdq[ua]` in x86). + * + * GCC for AArch64 sees `vld1q_u8` as an intrinsic instead of a load, so it + * prohibits load-store optimizations. Therefore, a direct dereference is used. + * + * Otherwise, `vld1q_u8` is used with `vreinterpretq_u8_u64` to do a safe + * unaligned load. + */ +#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) /* silence -Wcast-align */ +{ + return *(xxh_aliasing_uint64x2_t const *)ptr; +} +#else +XXH_FORCE_INLINE uint64x2_t XXH_vld1q_u64(void const* ptr) +{ + return vreinterpretq_u64_u8(vld1q_u8((uint8_t const*)ptr)); +} +#endif + +/*! + * @internal + * @brief `vmlal_u32` on low and high halves of a vector. 
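+ *
+ * In scalar terms, with `lhs`/`rhs` holding four 32-bit lanes each, the two
+ * helpers compute (sketch):
+ *
+ *   low:  acc[i] += (xxh_u64)lhs[i]   * rhs[i];    // i = 0..1
+ *   high: acc[i] += (xxh_u64)lhs[i+2] * rhs[i+2];  // i = 0..1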
+ *
+ * This is a workaround for AArch64 GCC < 11, which implemented arm_neon.h with
+ * inline assembly and was therefore incapable of merging the `vget_{low, high}_u32`
+ * with `vmlal_u32`.
+ */
+#if defined(__aarch64__) && defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 11
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* Inline assembly is the only way */
+    __asm__("umlal %0.2d, %1.2s, %2.2s" : "+w" (acc) : "w" (lhs), "w" (rhs));
+    return acc;
+}
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    /* This intrinsic works as expected */
+    return vmlal_high_u32(acc, lhs, rhs);
+}
+#else
+/* Portable intrinsic versions */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_low_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_low_u32(lhs), vget_low_u32(rhs));
+}
+/*! @copydoc XXH_vmlal_low_u32
+ * Assume the compiler converts this to vmlal_high_u32 on aarch64 */
+XXH_FORCE_INLINE uint64x2_t
+XXH_vmlal_high_u32(uint64x2_t acc, uint32x4_t lhs, uint32x4_t rhs)
+{
+    return vmlal_u32(acc, vget_high_u32(lhs), vget_high_u32(rhs));
+}
+#endif
+
+/*!
+ * @ingroup tuning
+ * @brief Controls the NEON to scalar ratio for XXH3
+ *
+ * This can be set to 2, 4, 6, or 8.
+ *
+ * ARM Cortex CPUs are _very_ sensitive to how their pipelines are used.
+ *
+ * For example, the Cortex-A73 can dispatch 3 micro-ops per cycle, but only 2 of those
+ * can be NEON. If you are only using NEON instructions, you are only using 2/3 of the CPU
+ * bandwidth.
+ *
+ * This is even more noticeable on the more advanced cores like the Cortex-A76 which
+ * can dispatch 8 micro-ops per cycle, but still only 2 NEON micro-ops at once.
+ *
+ * Therefore, to make the most out of the pipeline, it is beneficial to run 6 NEON lanes
+ * and 2 scalar lanes, which is chosen by default.
+ *
+ * This does not apply to Apple processors or 32-bit processors, which run better with
+ * full NEON. These will default to 8. Additionally, size-optimized builds run 8 lanes.
+ *
+ * This change benefits CPUs with large micro-op buffers without negatively affecting
+ * most other CPUs:
+ *
+ *  | Chipset               | Dispatch type       | NEON only | 6:2 hybrid | Diff. |
+ *  |:----------------------|:--------------------|----------:|-----------:|------:|
+ *  | Snapdragon 730 (A76)  | 2 NEON/8 micro-ops  |  8.8 GB/s |  10.1 GB/s |  ~16% |
+ *  | Snapdragon 835 (A73)  | 2 NEON/3 micro-ops  |  5.1 GB/s |   5.3 GB/s |   ~5% |
+ *  | Marvell PXA1928 (A53) | In-order dual-issue |  1.9 GB/s |   1.9 GB/s |    0% |
+ *  | Apple M1              | 4 NEON/8 micro-ops  | 37.3 GB/s |  36.1 GB/s |  ~-3% |
+ *
+ * It also seems to fix some bad codegen on GCC, making it almost as fast as clang.
+ *
+ * When using WASM SIMD128, if this is 2 or 6, SIMDe will scalarize 2 of the lanes,
+ * meaning it effectively becomes a worse version of 4.
+ *
+ * @see XXH3_accumulate_512_neon()
+ */
+# ifndef XXH3_NEON_LANES
+#  if (defined(__aarch64__) || defined(__arm64__) || defined(_M_ARM64) || defined(_M_ARM64EC)) \
+   && !defined(__APPLE__) && XXH_SIZE_OPT <= 0
+#   define XXH3_NEON_LANES 6
+#  else
+#   define XXH3_NEON_LANES XXH_ACC_NB
+#  endif
+# endif
+#endif  /* XXH_VECTOR == XXH_NEON */
+
+/*
+ * VSX and Z Vector helpers.
+ *
+ * This is very messy, and any pull requests to clean this up are welcome.
+ *
+ * There are a lot of problems with supporting VSX and s390x, due to
+ * inconsistent intrinsics, spotty coverage, and multiple endiannesses.
+ */
+#if XXH_VECTOR == XXH_VSX
+/* Annoyingly, these headers _may_ define three macros: `bool`, `vector`,
+ * and `pixel`. This is a problem for obvious reasons.
+ *
+ * These keywords are unnecessary; the spec literally says they are
+ * equivalent to `__bool`, `__vector`, and `__pixel` and may be undef'd
+ * after including the header.
+ *
+ * We use pragma push_macro/pop_macro to keep the namespace clean. */
+#  pragma push_macro("bool")
+#  pragma push_macro("vector")
+#  pragma push_macro("pixel")
+/* silence potential macro redefined warnings */
+#  undef bool
+#  undef vector
+#  undef pixel
+
+#  if defined(__s390x__)
+#    include <s390intrin.h>
+#  else
+#    include <altivec.h>
+#  endif
+
+/* Restore the original macro values, if applicable. */
+#  pragma pop_macro("pixel")
+#  pragma pop_macro("vector")
+#  pragma pop_macro("bool")
+
+typedef __vector unsigned long long xxh_u64x2;
+typedef __vector unsigned char xxh_u8x16;
+typedef __vector unsigned xxh_u32x4;
+
+/*
+ * UGLY HACK: s390x GCC has the same aliasing issue as aarch64 macOS GCC.
+ */
+typedef xxh_u64x2 xxh_aliasing_u64x2 XXH_ALIASING;
+
+# ifndef XXH_VSX_BE
+#  if defined(__BIG_ENDIAN__) \
+  || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define XXH_VSX_BE 1
+#  elif defined(__VEC_ELEMENT_REG_ORDER__) && __VEC_ELEMENT_REG_ORDER__ == __ORDER_BIG_ENDIAN__
+#    warning "-maltivec=be is not recommended. Please use native endianness."
+#    define XXH_VSX_BE 1
+#  else
+#    define XXH_VSX_BE 0
+#  endif
+# endif /* !defined(XXH_VSX_BE) */
+
+# if XXH_VSX_BE
+#  if defined(__POWER9_VECTOR__) || (defined(__clang__) && defined(__s390x__))
+#    define XXH_vec_revb vec_revb
+#  else
+/*!
+ * A polyfill for POWER9's vec_revb().
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_revb(xxh_u64x2 val)
+{
+    xxh_u8x16 const vByteSwap = { 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
+                                  0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08 };
+    return vec_perm(val, val, vByteSwap);
+}
+#  endif
+# endif /* XXH_VSX_BE */
+
+/*!
+ * Performs an unaligned vector load and byte swaps it on big endian.
+ */
+XXH_FORCE_INLINE xxh_u64x2 XXH_vec_loadu(const void *ptr)
+{
+    xxh_u64x2 ret;
+    XXH_memcpy(&ret, ptr, sizeof(xxh_u64x2));
+# if XXH_VSX_BE
+    ret = XXH_vec_revb(ret);
+# endif
+    return ret;
+}
+
+/*
+ * vec_mulo and vec_mule are very problematic intrinsics on PowerPC.
+ *
+ * These intrinsics weren't added until GCC 8, despite existing for a while,
+ * and they are endian dependent. Also, their meanings swap depending on the
+ * version.
+ */
+# if defined(__s390x__)
+ /* s390x is always big endian, no issue on this platform */
+#  define XXH_vec_mulo vec_mulo
+#  define XXH_vec_mule vec_mule
+# elif defined(__clang__) && XXH_HAS_BUILTIN(__builtin_altivec_vmuleuw) && !defined(__ibmxl__)
+/* Clang has a better way to control this: we can just use the builtin, which doesn't swap. */
+ /* The IBM XL Compiler (which defines __clang__) only implements the vec_* operations */
+#  define XXH_vec_mulo __builtin_altivec_vmulouw
+#  define XXH_vec_mule __builtin_altivec_vmuleuw
+# else
+/* gcc needs inline assembly */
+/* Adapted from https://github.com/google/highwayhash/blob/master/highwayhash/hh_vsx.h.
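+ *
+ * In scalar terms, per 64-bit result lane i, the two instructions compute
+ * (sketch; element numbering per the big-endian PowerPC convention):
+ *
+ *   vmuleuw: r[i] = (xxh_u64)a[2*i]   * b[2*i];    // even 32-bit elements
+ *   vmulouw: r[i] = (xxh_u64)a[2*i+1] * b[2*i+1];  // odd 32-bit elements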
*/ +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mulo(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmulouw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +XXH_FORCE_INLINE xxh_u64x2 XXH_vec_mule(xxh_u32x4 a, xxh_u32x4 b) +{ + xxh_u64x2 result; + __asm__("vmuleuw %0, %1, %2" : "=v" (result) : "v" (a), "v" (b)); + return result; +} +# endif /* XXH_vec_mulo, XXH_vec_mule */ +#endif /* XXH_VECTOR == XXH_VSX */ + +#if XXH_VECTOR == XXH_SVE +#define ACCRND(acc, offset) \ +do { \ + svuint64_t input_vec = svld1_u64(mask, xinput + offset); \ + svuint64_t secret_vec = svld1_u64(mask, xsecret + offset); \ + svuint64_t mixed = sveor_u64_x(mask, secret_vec, input_vec); \ + svuint64_t swapped = svtbl_u64(input_vec, kSwap); \ + svuint64_t mixed_lo = svextw_u64_x(mask, mixed); \ + svuint64_t mixed_hi = svlsr_n_u64_x(mask, mixed, 32); \ + svuint64_t mul = svmad_u64_x(mask, mixed_lo, mixed_hi, swapped); \ + acc = svadd_u64_x(mask, acc, mul); \ +} while (0) +#endif /* XXH_VECTOR == XXH_SVE */ + +/* prefetch + * can be disabled, by declaring XXH_NO_PREFETCH build macro */ +#if defined(XXH_NO_PREFETCH) +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +#else +# if XXH_SIZE_OPT >= 1 +# define XXH_PREFETCH(ptr) (void)(ptr) +# elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) /* _mm_prefetch() not defined outside of x86/x64 */ +# include <mmintrin.h> /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define XXH_PREFETCH(ptr) _mm_prefetch((const char*)(ptr), _MM_HINT_T0) +# elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) ) +# define XXH_PREFETCH(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) +# else +# define XXH_PREFETCH(ptr) (void)(ptr) /* disabled */ +# endif +#endif /* XXH_NO_PREFETCH */ + + +/* ========================================== + * XXH3 default settings + * ========================================== */ + +#define XXH_SECRET_DEFAULT_SIZE 192 /* minimum XXH3_SECRET_SIZE_MIN */ + +#if (XXH_SECRET_DEFAULT_SIZE < XXH3_SECRET_SIZE_MIN) +# error "default keyset is not large enough" +#endif + +/*! Pseudorandom secret taken directly from FARSH. 
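+ *
+ * Note that callers supplying their own secret (e.g. through the public
+ * XXH3_64bits_withSecret() entry point) must provide at least
+ * XXH3_SECRET_SIZE_MIN bytes, as asserted throughout this file.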
*/ +XXH_ALIGN(64) static const xxh_u8 XXH3_kSecret[XXH_SECRET_DEFAULT_SIZE] = { + 0xb8, 0xfe, 0x6c, 0x39, 0x23, 0xa4, 0x4b, 0xbe, 0x7c, 0x01, 0x81, 0x2c, 0xf7, 0x21, 0xad, 0x1c, + 0xde, 0xd4, 0x6d, 0xe9, 0x83, 0x90, 0x97, 0xdb, 0x72, 0x40, 0xa4, 0xa4, 0xb7, 0xb3, 0x67, 0x1f, + 0xcb, 0x79, 0xe6, 0x4e, 0xcc, 0xc0, 0xe5, 0x78, 0x82, 0x5a, 0xd0, 0x7d, 0xcc, 0xff, 0x72, 0x21, + 0xb8, 0x08, 0x46, 0x74, 0xf7, 0x43, 0x24, 0x8e, 0xe0, 0x35, 0x90, 0xe6, 0x81, 0x3a, 0x26, 0x4c, + 0x3c, 0x28, 0x52, 0xbb, 0x91, 0xc3, 0x00, 0xcb, 0x88, 0xd0, 0x65, 0x8b, 0x1b, 0x53, 0x2e, 0xa3, + 0x71, 0x64, 0x48, 0x97, 0xa2, 0x0d, 0xf9, 0x4e, 0x38, 0x19, 0xef, 0x46, 0xa9, 0xde, 0xac, 0xd8, + 0xa8, 0xfa, 0x76, 0x3f, 0xe3, 0x9c, 0x34, 0x3f, 0xf9, 0xdc, 0xbb, 0xc7, 0xc7, 0x0b, 0x4f, 0x1d, + 0x8a, 0x51, 0xe0, 0x4b, 0xcd, 0xb4, 0x59, 0x31, 0xc8, 0x9f, 0x7e, 0xc9, 0xd9, 0x78, 0x73, 0x64, + 0xea, 0xc5, 0xac, 0x83, 0x34, 0xd3, 0xeb, 0xc3, 0xc5, 0x81, 0xa0, 0xff, 0xfa, 0x13, 0x63, 0xeb, + 0x17, 0x0d, 0xdd, 0x51, 0xb7, 0xf0, 0xda, 0x49, 0xd3, 0x16, 0x55, 0x26, 0x29, 0xd4, 0x68, 0x9e, + 0x2b, 0x16, 0xbe, 0x58, 0x7d, 0x47, 0xa1, 0xfc, 0x8f, 0xf8, 0xb8, 0xd1, 0x7a, 0xd0, 0x31, 0xce, + 0x45, 0xcb, 0x3a, 0x8f, 0x95, 0x16, 0x04, 0x28, 0xaf, 0xd7, 0xfb, 0xca, 0xbb, 0x4b, 0x40, 0x7e, +}; + +static const xxh_u64 PRIME_MX1 = 0x165667919E3779F9ULL; /*!< 0b0001011001010110011001111001000110011110001101110111100111111001 */ +static const xxh_u64 PRIME_MX2 = 0x9FB21C651E98DF25ULL; /*!< 0b1001111110110010000111000110010100011110100110001101111100100101 */ + +#ifdef XXH_OLD_NAMES +# define kSecret XXH3_kSecret +#endif + +#ifdef XXH_DOXYGEN +/*! + * @brief Calculates a 32-bit to 64-bit long multiply. + * + * Implemented as a macro. + * + * Wraps `__emulu` on MSVC x86 because it tends to call `__allmul` when it doesn't + * need to (but it shouldn't need to anyways, it is about 7 instructions to do + * a 64x64 multiply...). Since we know that this will _always_ emit `MULL`, we + * use that instead of the normal method. + * + * If you are compiling for platforms like Thumb-1 and don't have a better option, + * you may also want to write your own long multiply routine here. + * + * @param x, y Numbers to be multiplied + * @return 64-bit product of the low 32 bits of @p x and @p y. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64(xxh_u64 x, xxh_u64 y) +{ + return (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF); +} +#elif defined(_MSC_VER) && defined(_M_IX86) +# define XXH_mult32to64(x, y) __emulu((unsigned)(x), (unsigned)(y)) +#else +/* + * Downcast + upcast is usually better than masking on older compilers like + * GCC 4.2 (especially 32-bit ones), all without affecting newer compilers. + * + * The other method, (x & 0xFFFFFFFF) * (y & 0xFFFFFFFF), will AND both operands + * and perform a full 64x64 multiply -- entirely redundant on 32-bit. + */ +# define XXH_mult32to64(x, y) ((xxh_u64)(xxh_u32)(x) * (xxh_u64)(xxh_u32)(y)) +#endif + +/*! + * @brief Calculates a 64->128-bit long multiply. + * + * Uses `__uint128_t` and `_umul128` if available, otherwise uses a scalar + * version. + * + * @param lhs , rhs The 64-bit integers to be multiplied + * @return The 128-bit result represented in an @ref XXH128_hash_t. + */ +static XXH128_hash_t +XXH_mult64to128(xxh_u64 lhs, xxh_u64 rhs) +{ + /* + * GCC/Clang __uint128_t method. + * + * On most 64-bit targets, GCC and Clang define a __uint128_t type. + * This is usually the best way as it usually uses a native long 64-bit + * multiply, such as MULQ on x86_64 or MUL + UMULH on aarch64. + * + * Usually. 
+ * + * Despite being a 32-bit platform, Clang (and emscripten) define this type + * despite not having the arithmetic for it. This results in a laggy + * compiler builtin call which calculates a full 128-bit multiply. + * In that case it is best to use the portable one. + * https://github.com/Cyan4973/xxHash/issues/211#issuecomment-515575677 + */ +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__wasm__) \ + && defined(__SIZEOF_INT128__) \ + || (defined(_INTEGRAL_MAX_BITS) && _INTEGRAL_MAX_BITS >= 128) + + __uint128_t const product = (__uint128_t)lhs * (__uint128_t)rhs; + XXH128_hash_t r128; + r128.low64 = (xxh_u64)(product); + r128.high64 = (xxh_u64)(product >> 64); + return r128; + + /* + * MSVC for x64's _umul128 method. + * + * xxh_u64 _umul128(xxh_u64 Multiplier, xxh_u64 Multiplicand, xxh_u64 *HighProduct); + * + * This compiles to single operand MUL on x64. + */ +#elif (defined(_M_X64) || defined(_M_IA64)) && !defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(_umul128) +#endif + xxh_u64 product_high; + xxh_u64 const product_low = _umul128(lhs, rhs, &product_high); + XXH128_hash_t r128; + r128.low64 = product_low; + r128.high64 = product_high; + return r128; + + /* + * MSVC for ARM64's __umulh method. + * + * This compiles to the same MUL + UMULH as GCC/Clang's __uint128_t method. + */ +#elif defined(_M_ARM64) || defined(_M_ARM64EC) + +#ifndef _MSC_VER +# pragma intrinsic(__umulh) +#endif + XXH128_hash_t r128; + r128.low64 = lhs * rhs; + r128.high64 = __umulh(lhs, rhs); + return r128; + +#else + /* + * Portable scalar method. Optimized for 32-bit and 64-bit ALUs. + * + * This is a fast and simple grade school multiply, which is shown below + * with base 10 arithmetic instead of base 0x100000000. + * + * 9 3 // D2 lhs = 93 + * x 7 5 // D2 rhs = 75 + * ---------- + * 1 5 // D2 lo_lo = (93 % 10) * (75 % 10) = 15 + * 4 5 | // D2 hi_lo = (93 / 10) * (75 % 10) = 45 + * 2 1 | // D2 lo_hi = (93 % 10) * (75 / 10) = 21 + * + 6 3 | | // D2 hi_hi = (93 / 10) * (75 / 10) = 63 + * --------- + * 2 7 | // D2 cross = (15 / 10) + (45 % 10) + 21 = 27 + * + 6 7 | | // D2 upper = (27 / 10) + (45 / 10) + 63 = 67 + * --------- + * 6 9 7 5 // D4 res = (27 * 10) + (15 % 10) + (67 * 100) = 6975 + * + * The reasons for adding the products like this are: + * 1. It avoids manual carry tracking. Just like how + * (9 * 9) + 9 + 9 = 99, the same applies with this for UINT64_MAX. + * This avoids a lot of complexity. + * + * 2. It hints for, and on Clang, compiles to, the powerful UMAAL + * instruction available in ARM's Digital Signal Processing extension + * in 32-bit ARMv6 and later, which is shown below: + * + * void UMAAL(xxh_u32 *RdLo, xxh_u32 *RdHi, xxh_u32 Rn, xxh_u32 Rm) + * { + * xxh_u64 product = (xxh_u64)*RdLo * (xxh_u64)*RdHi + Rn + Rm; + * *RdLo = (xxh_u32)(product & 0xFFFFFFFF); + * *RdHi = (xxh_u32)(product >> 32); + * } + * + * This instruction was designed for efficient long multiplication, and + * allows this to be calculated in only 4 instructions at speeds + * comparable to some 64-bit ALUs. + * + * 3. It isn't terrible on other platforms. Usually this will be a couple + * of 32-bit ADD/ADCs. + */ + + /* First calculate all of the cross products. */ + xxh_u64 const lo_lo = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs & 0xFFFFFFFF); + xxh_u64 const hi_lo = XXH_mult32to64(lhs >> 32, rhs & 0xFFFFFFFF); + xxh_u64 const lo_hi = XXH_mult32to64(lhs & 0xFFFFFFFF, rhs >> 32); + xxh_u64 const hi_hi = XXH_mult32to64(lhs >> 32, rhs >> 32); + + /* Now add the products together. 
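+     * (Each sum is bounded by (2^32-1)^2 + 2*(2^32-1) = 2^64 - 1, so it
+     * fits in 64 bits.)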
These will never overflow. */ + xxh_u64 const cross = (lo_lo >> 32) + (hi_lo & 0xFFFFFFFF) + lo_hi; + xxh_u64 const upper = (hi_lo >> 32) + (cross >> 32) + hi_hi; + xxh_u64 const lower = (cross << 32) | (lo_lo & 0xFFFFFFFF); + + XXH128_hash_t r128; + r128.low64 = lower; + r128.high64 = upper; + return r128; +#endif +} + +/*! + * @brief Calculates a 64-bit to 128-bit multiply, then XOR folds it. + * + * The reason for the separate function is to prevent passing too many structs + * around by value. This will hopefully inline the multiply, but we don't force it. + * + * @param lhs , rhs The 64-bit integers to multiply + * @return The low 64 bits of the product XOR'd by the high 64 bits. + * @see XXH_mult64to128() + */ +static xxh_u64 +XXH3_mul128_fold64(xxh_u64 lhs, xxh_u64 rhs) +{ + XXH128_hash_t product = XXH_mult64to128(lhs, rhs); + return product.low64 ^ product.high64; +} + +/*! Seems to produce slightly better code on GCC for some reason. */ +XXH_FORCE_INLINE XXH_CONSTF xxh_u64 XXH_xorshift64(xxh_u64 v64, int shift) +{ + XXH_ASSERT(0 <= shift && shift < 64); + return v64 ^ (v64 >> shift); +} + +/* + * This is a fast avalanche stage, + * suitable when input bits are already partially mixed + */ +static XXH64_hash_t XXH3_avalanche(xxh_u64 h64) +{ + h64 = XXH_xorshift64(h64, 37); + h64 *= PRIME_MX1; + h64 = XXH_xorshift64(h64, 32); + return h64; +} + +/* + * This is a stronger avalanche, + * inspired by Pelle Evensen's rrmxmx + * preferable when input has not been previously mixed + */ +static XXH64_hash_t XXH3_rrmxmx(xxh_u64 h64, xxh_u64 len) +{ + /* this mix is inspired by Pelle Evensen's rrmxmx */ + h64 ^= XXH_rotl64(h64, 49) ^ XXH_rotl64(h64, 24); + h64 *= PRIME_MX2; + h64 ^= (h64 >> 35) + len ; + h64 *= PRIME_MX2; + return XXH_xorshift64(h64, 28); +} + + +/* ========================================== + * Short keys + * ========================================== + * One of the shortcomings of XXH32 and XXH64 was that their performance was + * sub-optimal on short lengths. It used an iterative algorithm which strongly + * favored lengths that were a multiple of 4 or 8. + * + * Instead of iterating over individual inputs, we use a set of single shot + * functions which piece together a range of lengths and operate in constant time. + * + * Additionally, the number of multiplies has been significantly reduced. This + * reduces latency, especially when emulating 64-bit multiplies on 32-bit. + * + * Depending on the platform, this may or may not be faster than XXH32, but it + * is almost guaranteed to be faster than XXH64. + */ + +/* + * At very short lengths, there isn't enough input to fully hide secrets, or use + * the entire secret. + * + * There is also only a limited amount of mixing we can do before significantly + * impacting performance. + * + * Therefore, we use different sections of the secret and always mix two secret + * samples with an XOR. This should have no effect on performance on the + * seedless or withSeed variants because everything _should_ be constant folded + * by modern compilers. + * + * The XOR mixing hides individual parts of the secret and increases entropy. + * + * This adds an extra layer of strength for custom secrets. 
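+ *
+ * As an illustration, the 1..3-byte path below derives its key material
+ * this way (cf. XXH3_len_1to3_64b()):
+ *
+ *   bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+ *   hash    = XXH64_avalanche((xxh_u64)combined ^ bitflip);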
+ */ +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_1to3_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(1 <= len && len <= 3); + XXH_ASSERT(secret != NULL); + /* + * len = 1: combined = { input[0], 0x01, input[0], input[0] } + * len = 2: combined = { input[1], 0x02, input[0], input[1] } + * len = 3: combined = { input[2], 0x03, input[0], input[1] } + */ + { xxh_u8 const c1 = input[0]; + xxh_u8 const c2 = input[len >> 1]; + xxh_u8 const c3 = input[len - 1]; + xxh_u32 const combined = ((xxh_u32)c1 << 16) | ((xxh_u32)c2 << 24) + | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8); + xxh_u64 const bitflip = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed; + xxh_u64 const keyed = (xxh_u64)combined ^ bitflip; + return XXH64_avalanche(keyed); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_4to8_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(4 <= len && len <= 8); + seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32; + { xxh_u32 const input1 = XXH_readLE32(input); + xxh_u32 const input2 = XXH_readLE32(input + len - 4); + xxh_u64 const bitflip = (XXH_readLE64(secret+8) ^ XXH_readLE64(secret+16)) - seed; + xxh_u64 const input64 = input2 + (((xxh_u64)input1) << 32); + xxh_u64 const keyed = input64 ^ bitflip; + return XXH3_rrmxmx(keyed, len); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_9to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(input != NULL); + XXH_ASSERT(secret != NULL); + XXH_ASSERT(9 <= len && len <= 16); + { xxh_u64 const bitflip1 = (XXH_readLE64(secret+24) ^ XXH_readLE64(secret+32)) + seed; + xxh_u64 const bitflip2 = (XXH_readLE64(secret+40) ^ XXH_readLE64(secret+48)) - seed; + xxh_u64 const input_lo = XXH_readLE64(input) ^ bitflip1; + xxh_u64 const input_hi = XXH_readLE64(input + len - 8) ^ bitflip2; + xxh_u64 const acc = len + + XXH_swap64(input_lo) + input_hi + + XXH3_mul128_fold64(input_lo, input_hi); + return XXH3_avalanche(acc); + } +} + +XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t +XXH3_len_0to16_64b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (XXH_likely(len > 8)) return XXH3_len_9to16_64b(input, len, secret, seed); + if (XXH_likely(len >= 4)) return XXH3_len_4to8_64b(input, len, secret, seed); + if (len) return XXH3_len_1to3_64b(input, len, secret, seed); + return XXH64_avalanche(seed ^ (XXH_readLE64(secret+56) ^ XXH_readLE64(secret+64))); + } +} + +/* + * DISCLAIMER: There are known *seed-dependent* multicollisions here due to + * multiplication by zero, affecting hashes of lengths 17 to 240. + * + * However, they are very unlikely. + * + * Keep this in mind when using the unseeded XXH3_64bits() variant: As with all + * unseeded non-cryptographic hashes, it does not attempt to defend itself + * against specially crafted inputs, only random inputs. + * + * Compared to classic UMAC where a 1 in 2^31 chance of 4 consecutive bytes + * cancelling out the secret is taken an arbitrary number of times (addressed + * in XXH3_accumulate_512), this collision is very unlikely with random inputs + * and/or proper seeding: + * + * This only has a 1 in 2^63 chance of 8 consecutive bytes cancelling out, in a + * function that is only called up to 16 times per hash with up to 240 bytes of + * input. 
+ *
+ * This is not too bad for a non-cryptographic hash function, especially with
+ * only 64-bit outputs.
+ *
+ * The 128-bit variant (which trades some speed for strength) is NOT affected
+ * by this, although it is always a good idea to use a proper seed if you care
+ * about strength.
+ */
+XXH_FORCE_INLINE xxh_u64 XXH3_mix16B(const xxh_u8* XXH_RESTRICT input,
+                                     const xxh_u8* XXH_RESTRICT secret, xxh_u64 seed64)
+{
+#if defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \
+  && defined(__i386__) && defined(__SSE2__)  /* x86 + SSE2 */ \
+  && !defined(XXH_ENABLE_AUTOVECTORIZE)      /* Define to disable like XXH32 hack */
+    /*
+     * UGLY HACK:
+     * GCC for x86 tends to autovectorize the 128-bit multiply, resulting in
+     * slower code.
+     *
+     * By forcing seed64 into a register, we disrupt the cost model and
+     * cause it to scalarize. See `XXH32_round()`.
+     *
+     * FIXME: Clang's output is still _much_ faster -- On an AMD Ryzen 3600,
+     * XXH3_64bits @ len=240 runs at 4.6 GB/s with Clang 9, but 3.3 GB/s on
+     * GCC 9.2, despite both emitting scalar code.
+     *
+     * GCC generates much better scalar code than Clang for the rest of XXH3,
+     * which is why finding a more optimal codepath is of interest.
+     */
+    XXH_COMPILER_GUARD(seed64);
+#endif
+    {   xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64 const input_hi = XXH_readLE64(input+8);
+        return XXH3_mul128_fold64(
+            input_lo ^ (XXH_readLE64(secret)   + seed64),
+            input_hi ^ (XXH_readLE64(secret+8) - seed64)
+        );
+    }
+}
+
+/* For mid-range keys, XXH3 uses a Mum-hash variant. */
+XXH_FORCE_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                     const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                     XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(16 < len && len <= 128);
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+#if XXH_SIZE_OPT >= 1
+        /* Smaller and cleaner, but slightly slower.
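+         * Each iteration mixes one 16-byte block from the front of the
+         * input and the mirrored block from the back, so ceil(len/32)
+         * iterations cover the whole 17..128 byte range.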
+         */
+        unsigned int i = (unsigned int)(len - 1) / 32;
+        do {
+            acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
+            acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
+        } while (i-- != 0);
+#else
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc += XXH3_mix16B(input+48, secret+96, seed);
+                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
+                }
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc += XXH3_mix16B(input+len-48, secret+80, seed);
+            }
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc += XXH3_mix16B(input+len-32, secret+48, seed);
+        }
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc += XXH3_mix16B(input+len-16, secret+16, seed);
+#endif
+        return XXH3_avalanche(acc);
+    }
+}
+
+#define XXH3_MIDSIZE_MAX 240
+
+XXH_NO_INLINE XXH_PUREF XXH64_hash_t
+XXH3_len_129to240_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                      const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                      XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    #define XXH3_MIDSIZE_STARTOFFSET 3
+    #define XXH3_MIDSIZE_LASTOFFSET  17
+
+    {   xxh_u64 acc = len * XXH_PRIME64_1;
+        xxh_u64 acc_end;
+        unsigned int const nbRounds = (unsigned int)len / 16;
+        unsigned int i;
+        XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+        for (i=0; i<8; i++) {
+            acc += XXH3_mix16B(input+(16*i), secret+(16*i), seed);
+        }
+        /* last bytes */
+        acc_end = XXH3_mix16B(input + len - 16, secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET, seed);
+        XXH_ASSERT(nbRounds >= 8);
+        acc = XXH3_avalanche(acc);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */ \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Clang for ARMv7-A tries to vectorize this loop, similar to GCC x86.
+         * Everywhere else, it uses scalar code.
+         *
+         * For 64->128-bit multiplies, even if the NEON code were 100% optimal,
+         * it would still be slower than UMAAL (see XXH_mult64to128).
+         *
+         * Unfortunately, Clang doesn't handle the long multiplies properly and
+         * converts them to the nonexistent "vmulq_u64" intrinsic, which is then
+         * scalarized into an ugly mess of VMOV.32 instructions.
+         *
+         * This mess is difficult to avoid without turning autovectorization
+         * off completely, but the issues are usually relatively minor and/or
+         * not worth fixing.
+         *
+         * This loop is the easiest to fix, as unlike XXH32, this pragma
+         * _actually works_ because it is a loop vectorization instead of an
+         * SLP vectorization.
+         */
+        #pragma clang loop vectorize(disable)
+#endif
+        for (i=8 ; i < nbRounds; i++) {
+            /*
+             * Prevents Clang from unrolling the acc loop and interleaving it
+             * with this one.
+             */
+            XXH_COMPILER_GUARD(acc);
+            acc_end += XXH3_mix16B(input+(16*i), secret+(16*(i-8)) + XXH3_MIDSIZE_STARTOFFSET, seed);
+        }
+        return XXH3_avalanche(acc + acc_end);
+    }
+}
+
+
+/* =======     Long Keys     ======= */
+
+#define XXH_STRIPE_LEN 64
+#define XXH_SECRET_CONSUME_RATE 8   /* nb of secret bytes consumed at each accumulation */
+#define XXH_ACC_NB (XXH_STRIPE_LEN / sizeof(xxh_u64))
+
+#ifdef XXH_OLD_NAMES
+#  define STRIPE_LEN XXH_STRIPE_LEN
+#  define ACC_NB XXH_ACC_NB
+#endif
+
+#ifndef XXH_PREFETCH_DIST
+#  ifdef __clang__
+#    define XXH_PREFETCH_DIST 320
+#  else
+#    if (XXH_VECTOR == XXH_AVX512)
+#      define XXH_PREFETCH_DIST 512
+#    else
+#      define XXH_PREFETCH_DIST 384
+#    endif
+#  endif  /* __clang__ */
+#endif  /* XXH_PREFETCH_DIST */
+
+/*
+ * These macros are to generate an XXH3_accumulate() function.
+ * The name argument selects the function name suffix; the target attribute
+ * is supplied where the template is instantiated.
+ *
+ * The name of this symbol is XXH3_accumulate_<name>() and it calls
+ * XXH3_accumulate_512_<name>().
+ *
+ * It may be useful to hand-implement this function if the compiler fails to
+ * optimize the inline function.
+ */
+#define XXH3_ACCUMULATE_TEMPLATE(name)                      \
+void                                                        \
+XXH3_accumulate_##name(xxh_u64* XXH_RESTRICT acc,           \
+                       const xxh_u8* XXH_RESTRICT input,    \
+                       const xxh_u8* XXH_RESTRICT secret,   \
+                       size_t nbStripes)                    \
+{                                                           \
+    size_t n;                                               \
+    for (n = 0; n < nbStripes; n++ ) {                      \
+        const xxh_u8* const in = input + n*XXH_STRIPE_LEN;  \
+        XXH_PREFETCH(in + XXH_PREFETCH_DIST);               \
+        XXH3_accumulate_512_##name(                         \
+                 acc,                                       \
+                 in,                                        \
+                 secret + n*XXH_SECRET_CONSUME_RATE);       \
+    }                                                       \
+}
+
+
+XXH_FORCE_INLINE void XXH_writeLE64(void* dst, xxh_u64 v64)
+{
+    if (!XXH_CPU_LITTLE_ENDIAN) v64 = XXH_swap64(v64);
+    XXH_memcpy(dst, &v64, sizeof(v64));
+}
+
+/* Several intrinsic functions below are supposed to accept __int64 as argument,
+ * as documented in https://software.intel.com/sites/landingpage/IntrinsicsGuide/ .
+ * However, several environments do not define __int64 type,
+ * requiring a workaround.
+ */
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+    typedef int64_t xxh_i64;
+#else
+    /* the following type must have a width of 64-bit */
+    typedef long long xxh_i64;
+#endif
+
+
+/*
+ * XXH3_accumulate_512 is the tightest loop for long inputs, and it is the most optimized.
+ *
+ * It is a hardened version of UMAC, based off of FARSH's implementation.
+ *
+ * This was chosen because it adapts quite well to 32-bit, 64-bit, and SIMD
+ * implementations, and it is ridiculously fast.
+ *
+ * We harden it by mixing the original input into the accumulators as well as the product.
+ *
+ * This means that in the (relatively likely) case of a multiply by zero, the
+ * original input is preserved.
+ *
+ * On 128-bit inputs, we swap 64-bit pairs when we add the input to improve
+ * cross-pollination, as otherwise the upper and lower halves would be
+ * essentially independent.
+ *
+ * This doesn't matter on 64-bit hashes since they all get merged together in
+ * the end, so we skip the extra step.
+ *
+ * Both XXH3_64bits and XXH3_128bits use this subroutine.
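+ *
+ * Reduced to a single scalar lane, one round is essentially (sketch, cf.
+ * XXH3_scalarRound()):
+ *
+ *   data_key     = XXH_readLE64(input + 8*lane) ^ XXH_readLE64(secret + 8*lane);
+ *   acc[lane^1] += XXH_readLE64(input + 8*lane);  // keep the original input
+ *   acc[lane]   += (data_key & 0xFFFFFFFF) * (data_key >> 32);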
+ */ + +#if (XXH_VECTOR == XXH_AVX512) \ + || (defined(XXH_DISPATCH_AVX512) && XXH_DISPATCH_AVX512 != 0) + +#ifndef XXH_TARGET_AVX512 +# define XXH_TARGET_AVX512 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_accumulate_512_avx512(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + __m512i* const xacc = (__m512i *) acc; + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + + { + /* data_vec = input[0]; */ + __m512i const data_vec = _mm512_loadu_si512 (input); + /* key_vec = secret[0]; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + /* data_key = data_vec ^ key_vec; */ + __m512i const data_key = _mm512_xor_si512 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m512i const data_key_lo = _mm512_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m512i const product = _mm512_mul_epu32 (data_key, data_key_lo); + /* xacc[0] += swap(data_vec); */ + __m512i const data_swap = _mm512_shuffle_epi32(data_vec, (_MM_PERM_ENUM)_MM_SHUFFLE(1, 0, 3, 2)); + __m512i const sum = _mm512_add_epi64(*xacc, data_swap); + /* xacc[0] += product; */ + *xacc = _mm512_add_epi64(product, sum); + } +} +XXH_FORCE_INLINE XXH_TARGET_AVX512 XXH3_ACCUMULATE_TEMPLATE(avx512) + +/* + * XXH3_scrambleAcc: Scrambles the accumulators to improve mixing. + * + * Multiplication isn't perfect, as explained by Google in HighwayHash: + * + * // Multiplication mixes/scrambles bytes 0-7 of the 64-bit result to + * // varying degrees. In descending order of goodness, bytes + * // 3 4 2 5 1 6 0 7 have quality 228 224 164 160 100 96 36 32. + * // As expected, the upper and lower bytes are much worse. + * + * Source: https://github.com/google/highwayhash/blob/0aaf66b/highwayhash/hh_avx2.h#L291 + * + * Since our algorithm uses a pseudorandom secret to add some variance into the + * mix, we don't need to (or want to) mix as often or as much as HighwayHash does. + * + * This isn't as tight as XXH3_accumulate, but still written in SIMD to avoid + * extraction. + * + * Both XXH3_64bits and XXH3_128bits use this subroutine. 
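+ *
+ * Per 64-bit lane, the scramble amounts to (scalar sketch):
+ *
+ *   acc[lane] ^= acc[lane] >> 47;
+ *   acc[lane] ^= XXH_readLE64(secret + 8*lane);
+ *   acc[lane] *= XXH_PRIME32_1;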
+ */ + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_scrambleAcc_avx512(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 63) == 0); + XXH_STATIC_ASSERT(XXH_STRIPE_LEN == sizeof(__m512i)); + { __m512i* const xacc = (__m512i*) acc; + const __m512i prime32 = _mm512_set1_epi32((int)XXH_PRIME32_1); + + /* xacc[0] ^= (xacc[0] >> 47) */ + __m512i const acc_vec = *xacc; + __m512i const shifted = _mm512_srli_epi64 (acc_vec, 47); + /* xacc[0] ^= secret; */ + __m512i const key_vec = _mm512_loadu_si512 (secret); + __m512i const data_key = _mm512_ternarylogic_epi32(key_vec, acc_vec, shifted, 0x96 /* key_vec ^ acc_vec ^ shifted */); + + /* xacc[0] *= XXH_PRIME32_1; */ + __m512i const data_key_hi = _mm512_srli_epi64 (data_key, 32); + __m512i const prod_lo = _mm512_mul_epu32 (data_key, prime32); + __m512i const prod_hi = _mm512_mul_epu32 (data_key_hi, prime32); + *xacc = _mm512_add_epi64(prod_lo, _mm512_slli_epi64(prod_hi, 32)); + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX512 void +XXH3_initCustomSecret_avx512(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 63) == 0); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN == 64); + XXH_ASSERT(((size_t)customSecret & 63) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m512i); + __m512i const seed_pos = _mm512_set1_epi64((xxh_i64)seed64); + __m512i const seed = _mm512_mask_sub_epi64(seed_pos, 0xAA, _mm512_set1_epi8(0), seed_pos); + + const __m512i* const src = (const __m512i*) ((const void*) XXH3_kSecret); + __m512i* const dest = ( __m512i*) customSecret; + int i; + XXH_ASSERT(((size_t)src & 63) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 63) == 0); + for (i=0; i < nbRounds; ++i) { + dest[i] = _mm512_add_epi64(_mm512_load_si512(src + i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_AVX2) \ + || (defined(XXH_DISPATCH_AVX2) && XXH_DISPATCH_AVX2 != 0) + +#ifndef XXH_TARGET_AVX2 +# define XXH_TARGET_AVX2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_accumulate_512_avx2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xinput = (const __m256i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. 
*/ + const __m256i* const xsecret = (const __m256i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* data_vec = xinput[i]; */ + __m256i const data_vec = _mm256_loadu_si256 (xinput+i); + /* key_vec = xsecret[i]; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m256i const data_key_lo = _mm256_srli_epi64 (data_key, 32); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m256i const product = _mm256_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m256i const data_swap = _mm256_shuffle_epi32(data_vec, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i const sum = _mm256_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm256_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_AVX2 XXH3_ACCUMULATE_TEMPLATE(avx2) + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void +XXH3_scrambleAcc_avx2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 31) == 0); + { __m256i* const xacc = (__m256i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm256_loadu_si256 requires a const __m256i * pointer for some reason. */ + const __m256i* const xsecret = (const __m256i *) secret; + const __m256i prime32 = _mm256_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m256i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m256i const acc_vec = xacc[i]; + __m256i const shifted = _mm256_srli_epi64 (acc_vec, 47); + __m256i const data_vec = _mm256_xor_si256 (acc_vec, shifted); + /* xacc[i] ^= xsecret; */ + __m256i const key_vec = _mm256_loadu_si256 (xsecret+i); + __m256i const data_key = _mm256_xor_si256 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m256i const data_key_hi = _mm256_srli_epi64 (data_key, 32); + __m256i const prod_lo = _mm256_mul_epu32 (data_key, prime32); + __m256i const prod_hi = _mm256_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm256_add_epi64(prod_lo, _mm256_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_AVX2 void XXH3_initCustomSecret_avx2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 31) == 0); + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE / sizeof(__m256i)) == 6); + XXH_STATIC_ASSERT(XXH_SEC_ALIGN <= 64); + (void)(&XXH_writeLE64); + XXH_PREFETCH(customSecret); + { __m256i const seed = _mm256_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64, (xxh_i64)(0U - seed64), (xxh_i64)seed64); + + const __m256i* const src = (const __m256i*) ((const void*) XXH3_kSecret); + __m256i* dest = ( __m256i*) customSecret; + +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dest); +# endif + XXH_ASSERT(((size_t)src & 31) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dest & 31) == 0); + + /* GCC -O2 need unroll loop manually */ + dest[0] = _mm256_add_epi64(_mm256_load_si256(src+0), seed); + dest[1] = _mm256_add_epi64(_mm256_load_si256(src+1), seed); + dest[2] = _mm256_add_epi64(_mm256_load_si256(src+2), seed); + dest[3] = _mm256_add_epi64(_mm256_load_si256(src+3), seed); + dest[4] = _mm256_add_epi64(_mm256_load_si256(src+4), seed); + dest[5] 
= _mm256_add_epi64(_mm256_load_si256(src+5), seed); + } +} + +#endif + +/* x86dispatch always generates SSE2 */ +#if (XXH_VECTOR == XXH_SSE2) || defined(XXH_X86DISPATCH) + +#ifndef XXH_TARGET_SSE2 +# define XXH_TARGET_SSE2 /* disable attribute target */ +#endif + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_accumulate_512_sse2( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* SSE2 is just a half-scale version of the AVX2 version. */ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i *) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xinput = (const __m128i *) input; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. */ + const __m128i* const xsecret = (const __m128i *) secret; + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* data_vec = xinput[i]; */ + __m128i const data_vec = _mm_loadu_si128 (xinput+i); + /* key_vec = xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + /* data_key = data_vec ^ key_vec; */ + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + /* data_key_lo = data_key >> 32; */ + __m128i const data_key_lo = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + /* product = (data_key & 0xffffffff) * (data_key_lo & 0xffffffff); */ + __m128i const product = _mm_mul_epu32 (data_key, data_key_lo); + /* xacc[i] += swap(data_vec); */ + __m128i const data_swap = _mm_shuffle_epi32(data_vec, _MM_SHUFFLE(1,0,3,2)); + __m128i const sum = _mm_add_epi64(xacc[i], data_swap); + /* xacc[i] += product; */ + xacc[i] = _mm_add_epi64(product, sum); + } } +} +XXH_FORCE_INLINE XXH_TARGET_SSE2 XXH3_ACCUMULATE_TEMPLATE(sse2) + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void +XXH3_scrambleAcc_sse2(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + { __m128i* const xacc = (__m128i*) acc; + /* Unaligned. This is mainly for pointer arithmetic, and because + * _mm_loadu_si128 requires a const __m128i * pointer for some reason. 
*/ + const __m128i* const xsecret = (const __m128i *) secret; + const __m128i prime32 = _mm_set1_epi32((int)XXH_PRIME32_1); + + size_t i; + for (i=0; i < XXH_STRIPE_LEN/sizeof(__m128i); i++) { + /* xacc[i] ^= (xacc[i] >> 47) */ + __m128i const acc_vec = xacc[i]; + __m128i const shifted = _mm_srli_epi64 (acc_vec, 47); + __m128i const data_vec = _mm_xor_si128 (acc_vec, shifted); + /* xacc[i] ^= xsecret[i]; */ + __m128i const key_vec = _mm_loadu_si128 (xsecret+i); + __m128i const data_key = _mm_xor_si128 (data_vec, key_vec); + + /* xacc[i] *= XXH_PRIME32_1; */ + __m128i const data_key_hi = _mm_shuffle_epi32 (data_key, _MM_SHUFFLE(0, 3, 0, 1)); + __m128i const prod_lo = _mm_mul_epu32 (data_key, prime32); + __m128i const prod_hi = _mm_mul_epu32 (data_key_hi, prime32); + xacc[i] = _mm_add_epi64(prod_lo, _mm_slli_epi64(prod_hi, 32)); + } + } +} + +XXH_FORCE_INLINE XXH_TARGET_SSE2 void XXH3_initCustomSecret_sse2(void* XXH_RESTRICT customSecret, xxh_u64 seed64) +{ + XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0); + (void)(&XXH_writeLE64); + { int const nbRounds = XXH_SECRET_DEFAULT_SIZE / sizeof(__m128i); + +# if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900 + /* MSVC 32bit mode does not support _mm_set_epi64x before 2015 */ + XXH_ALIGN(16) const xxh_i64 seed64x2[2] = { (xxh_i64)seed64, (xxh_i64)(0U - seed64) }; + __m128i const seed = _mm_load_si128((__m128i const*)seed64x2); +# else + __m128i const seed = _mm_set_epi64x((xxh_i64)(0U - seed64), (xxh_i64)seed64); +# endif + int i; + + const void* const src16 = XXH3_kSecret; + __m128i* dst16 = (__m128i*) customSecret; +# if defined(__GNUC__) || defined(__clang__) + /* + * On GCC & Clang, marking 'dest' as modified will cause the compiler: + * - do not extract the secret from sse registers in the internal loop + * - use less common registers, and avoid pushing these reg into stack + */ + XXH_COMPILER_GUARD(dst16); +# endif + XXH_ASSERT(((size_t)src16 & 15) == 0); /* control alignment */ + XXH_ASSERT(((size_t)dst16 & 15) == 0); + + for (i=0; i < nbRounds; ++i) { + dst16[i] = _mm_add_epi64(_mm_load_si128((const __m128i *)src16+i), seed); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_NEON) + +/* forward declarations for the scalar routines */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, size_t lane); + +XXH_FORCE_INLINE void +XXH3_scalarScrambleRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT secret, size_t lane); + +/*! + * @internal + * @brief The bulk processing loop for NEON and WASM SIMD128. + * + * The NEON code path is actually partially scalar when running on AArch64. This + * is to optimize the pipelining and can have up to 15% speedup depending on the + * CPU, and it also mitigates some GCC codegen issues. + * + * @see XXH3_NEON_LANES for configuring this and details about this optimization. + * + * NEON's 32-bit to 64-bit long multiply takes a half vector of 32-bit + * integers instead of the other platforms which mask full 64-bit vectors, + * so the setup is more complicated than just shifting right. + * + * Additionally, there is an optimization for 4 lanes at once noted below. + * + * Since, as stated, the most optimal amount of lanes for Cortexes is 6, + * there needs to be *three* versions of the accumulate operation used + * for the remaining 2 lanes. + * + * WASM's SIMD128 uses SIMDe's arm_neon.h polyfill because the intrinsics overlap + * nearly perfectly. 
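+ *
+ * With the default XXH3_NEON_LANES == 6, one 64-byte stripe is therefore
+ * processed as (sketch):
+ *
+ *   lanes 0..5: three uint64x2_t vectors (two lanes each) on NEON
+ *   lanes 6..7: XXH3_scalarRound() on the scalar pipeline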
+ */ + +XXH_FORCE_INLINE void +XXH3_accumulate_512_neon( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + XXH_STATIC_ASSERT(XXH3_NEON_LANES > 0 && XXH3_NEON_LANES <= XXH_ACC_NB && XXH3_NEON_LANES % 2 == 0); + { /* GCC for darwin arm64 does not like aliasing here */ + xxh_aliasing_uint64x2_t* const xacc = (xxh_aliasing_uint64x2_t*) acc; + /* We don't use a uint32x4_t pointer because it causes bus errors on ARMv7. */ + uint8_t const* xinput = (const uint8_t *) input; + uint8_t const* xsecret = (const uint8_t *) secret; + + size_t i; +#ifdef __wasm_simd128__ + /* + * On WASM SIMD128, Clang emits direct address loads when XXH3_kSecret + * is constant propagated, which results in it converting it to this + * inside the loop: + * + * a = v128.load(XXH3_kSecret + 0 + $secret_offset, offset = 0) + * b = v128.load(XXH3_kSecret + 16 + $secret_offset, offset = 0) + * ... + * + * This requires a full 32-bit address immediate (and therefore a 6 byte + * instruction) as well as an add for each offset. + * + * Putting an asm guard prevents it from folding (at the cost of losing + * the alignment hint), and uses the free offset in `v128.load` instead + * of adding secret_offset each time which overall reduces code size by + * about a kilobyte and improves performance. + */ + XXH_COMPILER_GUARD(xsecret); +#endif + /* Scalar lanes use the normal scalarRound routine */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } + i = 0; + /* 4 NEON lanes at a time. */ + for (; i+1 < XXH3_NEON_LANES / 2; i+=2) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec_1 = XXH_vld1q_u64(xinput + (i * 16)); + uint64x2_t data_vec_2 = XXH_vld1q_u64(xinput + ((i+1) * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec_1 = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t key_vec_2 = XXH_vld1q_u64(xsecret + ((i+1) * 16)); + /* data_swap = swap(data_vec) */ + uint64x2_t data_swap_1 = vextq_u64(data_vec_1, data_vec_1, 1); + uint64x2_t data_swap_2 = vextq_u64(data_vec_2, data_vec_2, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key_1 = veorq_u64(data_vec_1, key_vec_1); + uint64x2_t data_key_2 = veorq_u64(data_vec_2, key_vec_2); + + /* + * If we reinterpret the 64x2 vectors as 32x4 vectors, we can use a + * de-interleave operation for 4 lanes in 1 step with `vuzpq_u32` to + * get one vector with the low 32 bits of each lane, and one vector + * with the high 32 bits of each lane. + * + * The intrinsic returns a double vector because the original ARMv7-a + * instruction modified both arguments in place. AArch64 and SIMD128 emit + * two instructions from this intrinsic. + * + * [ dk11L | dk11H | dk12L | dk12H ] -> [ dk11L | dk12L | dk21L | dk22L ] + * [ dk21L | dk21H | dk22L | dk22H ] -> [ dk11H | dk12H | dk21H | dk22H ] + */ + uint32x4x2_t unzipped = vuzpq_u32( + vreinterpretq_u32_u64(data_key_1), + vreinterpretq_u32_u64(data_key_2) + ); + /* data_key_lo = data_key & 0xFFFFFFFF */ + uint32x4_t data_key_lo = unzipped.val[0]; + /* data_key_hi = data_key >> 32 */ + uint32x4_t data_key_hi = unzipped.val[1]; + /* + * Then, we can split the vectors horizontally and multiply which, as for most + * widening intrinsics, have a variant that works on both high half vectors + * for free on AArch64. A similar instruction is available on SIMD128. 
+ * + * sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi + */ + uint64x2_t sum_1 = XXH_vmlal_low_u32(data_swap_1, data_key_lo, data_key_hi); + uint64x2_t sum_2 = XXH_vmlal_high_u32(data_swap_2, data_key_lo, data_key_hi); + /* + * Clang reorders + * a += b * c; // umlal swap.2d, dkl.2s, dkh.2s + * c += a; // add acc.2d, acc.2d, swap.2d + * to + * c += a; // add acc.2d, acc.2d, swap.2d + * c += b * c; // umlal acc.2d, dkl.2s, dkh.2s + * + * While it would make sense in theory since the addition is faster, + * for reasons likely related to umlal being limited to certain NEON + * pipelines, this is worse. A compiler guard fixes this. + */ + XXH_COMPILER_GUARD_CLANG_NEON(sum_1); + XXH_COMPILER_GUARD_CLANG_NEON(sum_2); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64(xacc[i], sum_1); + xacc[i+1] = vaddq_u64(xacc[i+1], sum_2); + } + /* Operate on the remaining NEON lanes 2 at a time. */ + for (; i < XXH3_NEON_LANES / 2; i++) { + /* data_vec = xinput[i]; */ + uint64x2_t data_vec = XXH_vld1q_u64(xinput + (i * 16)); + /* key_vec = xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + /* acc_vec_2 = swap(data_vec) */ + uint64x2_t data_swap = vextq_u64(data_vec, data_vec, 1); + /* data_key = data_vec ^ key_vec; */ + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* For two lanes, just use VMOVN and VSHRN. */ + /* data_key_lo = data_key & 0xFFFFFFFF; */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* data_key_hi = data_key >> 32; */ + uint32x2_t data_key_hi = vshrn_n_u64(data_key, 32); + /* sum = data_swap + (u64x2) data_key_lo * (u64x2) data_key_hi; */ + uint64x2_t sum = vmlal_u32(data_swap, data_key_lo, data_key_hi); + /* Same Clang workaround as before */ + XXH_COMPILER_GUARD_CLANG_NEON(sum); + /* xacc[i] = acc_vec + sum; */ + xacc[i] = vaddq_u64 (xacc[i], sum); + } + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(neon) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_neon(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_uint64x2_t* xacc = (xxh_aliasing_uint64x2_t*) acc; + uint8_t const* xsecret = (uint8_t const*) secret; + + size_t i; + /* WASM uses operator overloads and doesn't need these. 
*/ +#ifndef __wasm_simd128__ + /* { prime32_1, prime32_1 } */ + uint32x2_t const kPrimeLo = vdup_n_u32(XXH_PRIME32_1); + /* { 0, prime32_1, 0, prime32_1 } */ + uint32x4_t const kPrimeHi = vreinterpretq_u32_u64(vdupq_n_u64((xxh_u64)XXH_PRIME32_1 << 32)); +#endif + + /* AArch64 uses both scalar and neon at the same time */ + for (i = XXH3_NEON_LANES; i < XXH_ACC_NB; i++) { + XXH3_scalarScrambleRound(acc, secret, i); + } + for (i=0; i < XXH3_NEON_LANES / 2; i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + uint64x2_t acc_vec = xacc[i]; + uint64x2_t shifted = vshrq_n_u64(acc_vec, 47); + uint64x2_t data_vec = veorq_u64(acc_vec, shifted); + + /* xacc[i] ^= xsecret[i]; */ + uint64x2_t key_vec = XXH_vld1q_u64(xsecret + (i * 16)); + uint64x2_t data_key = veorq_u64(data_vec, key_vec); + /* xacc[i] *= XXH_PRIME32_1 */ +#ifdef __wasm_simd128__ + /* SIMD128 has multiply by u64x2, use it instead of expanding and scalarizing */ + xacc[i] = data_key * XXH_PRIME32_1; +#else + /* + * Expanded version with portable NEON intrinsics + * + * lo(x) * lo(y) + (hi(x) * lo(y) << 32) + * + * prod_hi = hi(data_key) * lo(prime) << 32 + * + * Since we only need 32 bits of this multiply a trick can be used, reinterpreting the vector + * as a uint32x4_t and multiplying by { 0, prime, 0, prime } to cancel out the unwanted bits + * and avoid the shift. + */ + uint32x4_t prod_hi = vmulq_u32 (vreinterpretq_u32_u64(data_key), kPrimeHi); + /* Extract low bits for vmlal_u32 */ + uint32x2_t data_key_lo = vmovn_u64(data_key); + /* xacc[i] = prod_hi + lo(data_key) * XXH_PRIME32_1; */ + xacc[i] = vmlal_u32(vreinterpretq_u64_u32(prod_hi), data_key_lo, kPrimeLo); +#endif + } + } +} +#endif + +#if (XXH_VECTOR == XXH_VSX) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_vsx( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + /* presumed aligned */ + xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + xxh_u8 const* const xinput = (xxh_u8 const*) input; /* no alignment restriction */ + xxh_u8 const* const xsecret = (xxh_u8 const*) secret; /* no alignment restriction */ + xxh_u64x2 const v32 = { 32, 32 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / sizeof(xxh_u64x2); i++) { + /* data_vec = xinput[i]; */ + xxh_u64x2 const data_vec = XXH_vec_loadu(xinput + 16*i); + /* key_vec = xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + /* shuffled = (data_key << 32) | (data_key >> 32); */ + xxh_u32x4 const shuffled = (xxh_u32x4)vec_rl(data_key, v32); + /* product = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)shuffled & 0xFFFFFFFF); */ + xxh_u64x2 const product = XXH_vec_mulo((xxh_u32x4)data_key, shuffled); + /* acc_vec = xacc[i]; */ + xxh_u64x2 acc_vec = xacc[i]; + acc_vec += product; + + /* swap high and low halves */ +#ifdef __s390x__ + acc_vec += vec_permi(data_vec, data_vec, 2); +#else + acc_vec += vec_xxpermdi(data_vec, data_vec, 2); +#endif + xacc[i] = acc_vec; + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(vsx) + +XXH_FORCE_INLINE void +XXH3_scrambleAcc_vsx(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret) +{ + XXH_ASSERT((((size_t)acc) & 15) == 0); + + { xxh_aliasing_u64x2* const xacc = (xxh_aliasing_u64x2*) acc; + const xxh_u8* const xsecret = (const xxh_u8*) secret; + /* constants */ + xxh_u64x2 const v32 = { 32, 32 }; + xxh_u64x2 const v47 = { 47, 47 }; + xxh_u32x4 const prime = { XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1, XXH_PRIME32_1 }; + size_t i; + for (i = 0; i < XXH_STRIPE_LEN / 
sizeof(xxh_u64x2); i++) { + /* xacc[i] ^= (xacc[i] >> 47); */ + xxh_u64x2 const acc_vec = xacc[i]; + xxh_u64x2 const data_vec = acc_vec ^ (acc_vec >> v47); + + /* xacc[i] ^= xsecret[i]; */ + xxh_u64x2 const key_vec = XXH_vec_loadu(xsecret + 16*i); + xxh_u64x2 const data_key = data_vec ^ key_vec; + + /* xacc[i] *= XXH_PRIME32_1 */ + /* prod_lo = ((xxh_u64x2)data_key & 0xFFFFFFFF) * ((xxh_u64x2)prime & 0xFFFFFFFF); */ + xxh_u64x2 const prod_even = XXH_vec_mule((xxh_u32x4)data_key, prime); + /* prod_hi = ((xxh_u64x2)data_key >> 32) * ((xxh_u64x2)prime >> 32); */ + xxh_u64x2 const prod_odd = XXH_vec_mulo((xxh_u32x4)data_key, prime); + xacc[i] = prod_odd + (prod_even << v32); + } } +} + +#endif + +#if (XXH_VECTOR == XXH_SVE) + +XXH_FORCE_INLINE void +XXH3_accumulate_512_sve( void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc); + ACCRND(vacc, 0); + svst1_u64(mask, xacc, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } +} + +XXH_FORCE_INLINE void +XXH3_accumulate_sve(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, + const xxh_u8* XXH_RESTRICT secret, + size_t nbStripes) +{ + if (nbStripes != 0) { + uint64_t *xacc = (uint64_t *)acc; + const uint64_t *xinput = (const uint64_t *)(const void *)input; + const uint64_t *xsecret = (const uint64_t *)(const void *)secret; + svuint64_t kSwap = sveor_n_u64_z(svptrue_b64(), svindex_u64(0, 1), 1); + uint64_t element_count = svcntd(); + if (element_count >= 8) { + svbool_t mask = svptrue_pat_b64(SV_VL8); + svuint64_t vacc = svld1_u64(mask, xacc + 0); + do { + /* svprfd(svbool_t, void *, enum svfprop); */ + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(vacc, 0); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, vacc); + } else if (element_count == 2) { /* sve128 */ + svbool_t mask = svptrue_pat_b64(SV_VL2); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 2); + svuint64_t acc2 = svld1_u64(mask, xacc + 4); + svuint64_t acc3 = svld1_u64(mask, xacc + 6); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 2); + ACCRND(acc2, 4); + ACCRND(acc3, 6); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 2, acc1); + svst1_u64(mask, xacc + 4, acc2); + svst1_u64(mask, xacc + 6, acc3); + } else { + svbool_t mask = 
svptrue_pat_b64(SV_VL4); + svuint64_t acc0 = svld1_u64(mask, xacc + 0); + svuint64_t acc1 = svld1_u64(mask, xacc + 4); + do { + svprfd(mask, xinput + 128, SV_PLDL1STRM); + ACCRND(acc0, 0); + ACCRND(acc1, 4); + xinput += 8; + xsecret += 1; + nbStripes--; + } while (nbStripes != 0); + + svst1_u64(mask, xacc + 0, acc0); + svst1_u64(mask, xacc + 4, acc1); + } + } +} + +#endif + +/* scalar variants - universal */ + +#if defined(__aarch64__) && (defined(__GNUC__) || defined(__clang__)) +/* + * In XXH3_scalarRound(), GCC and Clang have a similar codegen issue, where they + * emit an excess mask and a full 64-bit multiply-add (MADD X-form). + * + * While this might not seem like much, as AArch64 is a 64-bit architecture, only + * big Cortex designs have a full 64-bit multiplier. + * + * On the little cores, the smaller 32-bit multiplier is used, and full 64-bit + * multiplies expand to 2-3 multiplies in microcode. This has a major penalty + * of up to 4 latency cycles and 2 stall cycles in the multiply pipeline. + * + * Thankfully, AArch64 still provides the 32-bit long multiply-add (UMADDL) which does + * not have this penalty and does the mask automatically. + */ +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + xxh_u64 ret; + /* note: %x = 64-bit register, %w = 32-bit register */ + __asm__("umaddl %x0, %w1, %w2, %x3" : "=r" (ret) : "r" (lhs), "r" (rhs), "r" (acc)); + return ret; +} +#else +XXH_FORCE_INLINE xxh_u64 +XXH_mult32to64_add64(xxh_u64 lhs, xxh_u64 rhs, xxh_u64 acc) +{ + return XXH_mult32to64((xxh_u32)lhs, (xxh_u32)rhs) + acc; +} +#endif + +/*! + * @internal + * @brief Scalar round for @ref XXH3_accumulate_512_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. + */ +XXH_FORCE_INLINE void +XXH3_scalarRound(void* XXH_RESTRICT acc, + void const* XXH_RESTRICT input, + void const* XXH_RESTRICT secret, + size_t lane) +{ + xxh_u64* xacc = (xxh_u64*) acc; + xxh_u8 const* xinput = (xxh_u8 const*) input; + xxh_u8 const* xsecret = (xxh_u8 const*) secret; + XXH_ASSERT(lane < XXH_ACC_NB); + XXH_ASSERT(((size_t)acc & (XXH_ACC_ALIGN-1)) == 0); + { + xxh_u64 const data_val = XXH_readLE64(xinput + lane * 8); + xxh_u64 const data_key = data_val ^ XXH_readLE64(xsecret + lane * 8); + xacc[lane ^ 1] += data_val; /* swap adjacent lanes */ + xacc[lane] = XXH_mult32to64_add64(data_key /* & 0xFFFFFFFF */, data_key >> 32, xacc[lane]); + } +} + +/*! + * @internal + * @brief Processes a 64 byte block of data using the scalar path. + */ +XXH_FORCE_INLINE void +XXH3_accumulate_512_scalar(void* XXH_RESTRICT acc, + const void* XXH_RESTRICT input, + const void* XXH_RESTRICT secret) +{ + size_t i; + /* ARM GCC refuses to unroll this loop, resulting in a 24% slowdown on ARMv6. */ +#if defined(__GNUC__) && !defined(__clang__) \ + && (defined(__arm__) || defined(__thumb2__)) \ + && defined(__ARM_FEATURE_UNALIGNED) /* no unaligned access just wastes bytes */ \ + && XXH_SIZE_OPT <= 0 +# pragma GCC unroll 8 +#endif + for (i=0; i < XXH_ACC_NB; i++) { + XXH3_scalarRound(acc, input, secret, i); + } +} +XXH_FORCE_INLINE XXH3_ACCUMULATE_TEMPLATE(scalar) + +/*! + * @internal + * @brief Scalar scramble step for @ref XXH3_scrambleAcc_scalar(). + * + * This is extracted to its own function because the NEON path uses a combination + * of NEON and scalar. 
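+ *
+ * Per lane, the step below amounts to (a restatement of the body, in the
+ * same order, not additional logic):
+ *
+ *   acc64 ^= acc64 >> 47;      // XXH_xorshift64(acc64, 47)
+ *   acc64 ^= secret64[lane];
+ *   acc64 *= XXH_PRIME32_1;    // 64x32 multiply, low 64 bits kept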
+ */
+XXH_FORCE_INLINE void
+XXH3_scalarScrambleRound(void* XXH_RESTRICT acc,
+                         void const* XXH_RESTRICT secret,
+                         size_t lane)
+{
+    xxh_u64* const xacc = (xxh_u64*) acc;   /* presumed aligned */
+    const xxh_u8* const xsecret = (const xxh_u8*) secret;   /* no alignment restriction */
+    XXH_ASSERT((((size_t)acc) & (XXH_ACC_ALIGN-1)) == 0);
+    XXH_ASSERT(lane < XXH_ACC_NB);
+    {
+        xxh_u64 const key64 = XXH_readLE64(xsecret + lane * 8);
+        xxh_u64 acc64 = xacc[lane];
+        acc64 = XXH_xorshift64(acc64, 47);
+        acc64 ^= key64;
+        acc64 *= XXH_PRIME32_1;
+        xacc[lane] = acc64;
+    }
+}
+
+/*!
+ * @internal
+ * @brief Scrambles the accumulators after a large chunk has been read
+ */
+XXH_FORCE_INLINE void
+XXH3_scrambleAcc_scalar(void* XXH_RESTRICT acc, const void* XXH_RESTRICT secret)
+{
+    size_t i;
+    for (i=0; i < XXH_ACC_NB; i++) {
+        XXH3_scalarScrambleRound(acc, secret, i);
+    }
+}
+
+XXH_FORCE_INLINE void
+XXH3_initCustomSecret_scalar(void* XXH_RESTRICT customSecret, xxh_u64 seed64)
+{
+    /*
+     * We need a separate pointer for the hack below,
+     * which requires a non-const pointer.
+     * Any decent compiler will optimize this out otherwise.
+     */
+    const xxh_u8* kSecretPtr = XXH3_kSecret;
+    XXH_STATIC_ASSERT((XXH_SECRET_DEFAULT_SIZE & 15) == 0);
+
+#if defined(__GNUC__) && defined(__aarch64__)
+    /*
+     * UGLY HACK:
+     * GCC and Clang generate a bunch of MOV/MOVK pairs for aarch64, and they are
+     * placed sequentially, in order, at the top of the unrolled loop.
+     *
+     * While MOVK is great for generating constants (2 cycles for a 64-bit
+     * constant compared to 4 cycles for LDR), it fights for bandwidth with
+     * the arithmetic instructions.
+     *
+     *   I   L   S
+     * MOVK
+     * MOVK
+     * MOVK
+     * MOVK
+     * ADD
+     * SUB     STR
+     *         STR
+     * By forcing loads from memory (as the asm line causes the compiler to assume
+     * that kSecretPtr has been changed), the pipelines are used more
+     * efficiently:
+     *   I   L   S
+     *     LDR
+     * ADD LDR
+     * SUB     STR
+     *         STR
+     *
+     * See XXH3_NEON_LANES for details on the pipeline.
+     *
+     * XXH3_64bits_withSeed, len == 256, Snapdragon 835
+     *   without hack: 2654.4 MB/s
+     *   with hack:    3202.9 MB/s
+     */
+    XXH_COMPILER_GUARD(kSecretPtr);
+#endif
+    {   int const nbRounds = XXH_SECRET_DEFAULT_SIZE / 16;
+        int i;
+        for (i=0; i < nbRounds; i++) {
+            /*
+             * The asm hack causes the compiler to assume that kSecretPtr aliases with
+             * customSecret, and on aarch64, this prevented LDP from merging two
+             * loads together for free. Putting the loads together before the stores
+             * properly generates LDP.
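+             *
+             * Net effect of the loop (an equivalent restatement, reading and
+             * writing little-endian 64-bit words):
+             *
+             *   customSecret64[2*i]   = kSecret64[2*i]   + seed64;
+             *   customSecret64[2*i+1] = kSecret64[2*i+1] - seed64;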
+ */ + xxh_u64 lo = XXH_readLE64(kSecretPtr + 16*i) + seed64; + xxh_u64 hi = XXH_readLE64(kSecretPtr + 16*i + 8) - seed64; + XXH_writeLE64((xxh_u8*)customSecret + 16*i, lo); + XXH_writeLE64((xxh_u8*)customSecret + 16*i + 8, hi); + } } +} + + +typedef void (*XXH3_f_accumulate)(xxh_u64* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, const xxh_u8* XXH_RESTRICT, size_t); +typedef void (*XXH3_f_scrambleAcc)(void* XXH_RESTRICT, const void*); +typedef void (*XXH3_f_initCustomSecret)(void* XXH_RESTRICT, xxh_u64); + + +#if (XXH_VECTOR == XXH_AVX512) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx512 +#define XXH3_accumulate XXH3_accumulate_avx512 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx512 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx512 + +#elif (XXH_VECTOR == XXH_AVX2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_avx2 +#define XXH3_accumulate XXH3_accumulate_avx2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_avx2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_avx2 + +#elif (XXH_VECTOR == XXH_SSE2) + +#define XXH3_accumulate_512 XXH3_accumulate_512_sse2 +#define XXH3_accumulate XXH3_accumulate_sse2 +#define XXH3_scrambleAcc XXH3_scrambleAcc_sse2 +#define XXH3_initCustomSecret XXH3_initCustomSecret_sse2 + +#elif (XXH_VECTOR == XXH_NEON) + +#define XXH3_accumulate_512 XXH3_accumulate_512_neon +#define XXH3_accumulate XXH3_accumulate_neon +#define XXH3_scrambleAcc XXH3_scrambleAcc_neon +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_VSX) + +#define XXH3_accumulate_512 XXH3_accumulate_512_vsx +#define XXH3_accumulate XXH3_accumulate_vsx +#define XXH3_scrambleAcc XXH3_scrambleAcc_vsx +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#elif (XXH_VECTOR == XXH_SVE) +#define XXH3_accumulate_512 XXH3_accumulate_512_sve +#define XXH3_accumulate XXH3_accumulate_sve +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#else /* scalar */ + +#define XXH3_accumulate_512 XXH3_accumulate_512_scalar +#define XXH3_accumulate XXH3_accumulate_scalar +#define XXH3_scrambleAcc XXH3_scrambleAcc_scalar +#define XXH3_initCustomSecret XXH3_initCustomSecret_scalar + +#endif + +#if XXH_SIZE_OPT >= 1 /* don't do SIMD for initialization */ +# undef XXH3_initCustomSecret +# define XXH3_initCustomSecret XXH3_initCustomSecret_scalar +#endif + +XXH_FORCE_INLINE void +XXH3_hashLong_internal_loop(xxh_u64* XXH_RESTRICT acc, + const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble) +{ + size_t const nbStripesPerBlock = (secretSize - XXH_STRIPE_LEN) / XXH_SECRET_CONSUME_RATE; + size_t const block_len = XXH_STRIPE_LEN * nbStripesPerBlock; + size_t const nb_blocks = (len - 1) / block_len; + + size_t n; + + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); + + for (n = 0; n < nb_blocks; n++) { + f_acc(acc, input + n*block_len, secret, nbStripesPerBlock); + f_scramble(acc, secret + secretSize - XXH_STRIPE_LEN); + } + + /* last partial block */ + XXH_ASSERT(len > XXH_STRIPE_LEN); + { size_t const nbStripes = ((len - 1) - (block_len * nb_blocks)) / XXH_STRIPE_LEN; + XXH_ASSERT(nbStripes <= (secretSize / XXH_SECRET_CONSUME_RATE)); + f_acc(acc, input + nb_blocks*block_len, secret, nbStripes); + + /* last stripe */ + { const xxh_u8* const p = input + len - XXH_STRIPE_LEN; +#define XXH_SECRET_LASTACC_START 7 /* not aligned on 8, last secret is different from acc & scrambler */ + XXH3_accumulate_512(acc, p, 
+                            secret + secretSize - XXH_STRIPE_LEN - XXH_SECRET_LASTACC_START);
+    } }
+}
+
+XXH_FORCE_INLINE xxh_u64
+XXH3_mix2Accs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret)
+{
+    return XXH3_mul128_fold64(
+               acc[0] ^ XXH_readLE64(secret),
+               acc[1] ^ XXH_readLE64(secret+8) );
+}
+
+static XXH64_hash_t
+XXH3_mergeAccs(const xxh_u64* XXH_RESTRICT acc, const xxh_u8* XXH_RESTRICT secret, xxh_u64 start)
+{
+    xxh_u64 result64 = start;
+    size_t i = 0;
+
+    for (i = 0; i < 4; i++) {
+        result64 += XXH3_mix2Accs(acc+2*i, secret + 16*i);
+#if defined(__clang__)                                /* Clang */ \
+    && (defined(__arm__) || defined(__thumb__))       /* ARMv7 */ \
+    && (defined(__ARM_NEON) || defined(__ARM_NEON__)) /* NEON */  \
+    && !defined(XXH_ENABLE_AUTOVECTORIZE)             /* Define to disable */
+        /*
+         * UGLY HACK:
+         * Prevent autovectorization on Clang ARMv7-a. Exact same problem as
+         * the one in XXH3_len_129to240_64b. Speeds up shorter keys > 240b.
+         * XXH3_64bits, len == 256, Snapdragon 835:
+         *   without hack: 2063.7 MB/s
+         *   with hack:    2560.7 MB/s
+         */
+        XXH_COMPILER_GUARD(result64);
+#endif
+    }
+
+    return XXH3_avalanche(result64);
+}
+
+#define XXH3_INIT_ACC { XXH_PRIME32_3, XXH_PRIME64_1, XXH_PRIME64_2, XXH_PRIME64_3, \
+                        XXH_PRIME64_4, XXH_PRIME32_2, XXH_PRIME64_5, XXH_PRIME32_1 }
+
+XXH_FORCE_INLINE XXH64_hash_t
+XXH3_hashLong_64b_internal(const void* XXH_RESTRICT input, size_t len,
+                           const void* XXH_RESTRICT secret, size_t secretSize,
+                           XXH3_f_accumulate f_acc,
+                           XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, (const xxh_u8*)secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    /* do not align on 8, so that the secret is different from the accumulator */
+#define XXH_SECRET_MERGEACCS_START 11
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    return XXH3_mergeAccs(acc, (const xxh_u8*)secret + XXH_SECRET_MERGEACCS_START, (xxh_u64)len * XXH_PRIME64_1);
+}
+
+/*
+ * It's important for performance to transmit the secret's size (when it's static)
+ * so that the compiler can properly optimize the vectorized loop.
+ * This makes a big performance difference for "medium" keys (<1 KB) when using the AVX instruction set.
+ * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE
+ * breaks -Og, this is XXH_NO_INLINE.
+ */
+XXH3_WITH_SECRET_INLINE XXH64_hash_t
+XXH3_hashLong_64b_withSecret(const void* XXH_RESTRICT input, size_t len,
+                             XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen)
+{
+    (void)seed64;
+    return XXH3_hashLong_64b_internal(input, len, secret, secretLen, XXH3_accumulate, XXH3_scrambleAcc);
+}
+
+/*
+ * It's preferable for performance that XXH3_hashLong is not inlined,
+ * as it results in a smaller function for small data, which is easier on the instruction cache.
+ * Note that inside this no_inline function, we do inline the internal loop,
+ * and provide a statically defined secret size to allow optimization of the vector loop.
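+ *
+ * Illustrative consequence (a sketch of the body below, not additional
+ * code): the call
+ *
+ *   XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), ...)
+ *
+ * passes a compile-time-constant secret size, so nbStripesPerBlock in
+ * XXH3_hashLong_internal_loop() becomes a constant the vector loop can be
+ * shaped around.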
+ */ +XXH_NO_INLINE XXH_PUREF XXH64_hash_t +XXH3_hashLong_64b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_64b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * XXH3_hashLong_64b_withSeed(): + * Generate a custom key based on alteration of default XXH3_kSecret with the seed, + * and then use this key for long mode hashing. + * + * This operation is decently fast but nonetheless costs a little bit of time. + * Try to avoid it whenever possible (typically when seed==0). + * + * It's important for performance that XXH3_hashLong is not inlined. Not sure + * why (uop cache maybe?), but the difference is large and easily measurable. + */ +XXH_FORCE_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed_internal(const void* input, size_t len, + XXH64_hash_t seed, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ +#if XXH_SIZE_OPT <= 0 + if (seed == 0) + return XXH3_hashLong_64b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); +#endif + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed); + return XXH3_hashLong_64b_internal(input, len, secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH64_hash_t +XXH3_hashLong_64b_withSeed(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed, const xxh_u8* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_64b_withSeed_internal(input, len, seed, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + + +typedef XXH64_hash_t (*XXH3_hashLong64_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const xxh_u8* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH64_hash_t +XXH3_64bits_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong64_f f_hashLong) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secretLen` condition is not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + * Also, note that function signature doesn't offer room to return an error. + */ + if (len <= 16) + return XXH3_len_0to16_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_64b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hashLong(input, len, seed64, (const xxh_u8*)secret, secretLen); +} + + +/* === Public entry point === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits(XXH_NOESCAPE const void* input, size_t length) +{ + return XXH3_64bits_internal(input, length, 0, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_default); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t +XXH3_64bits_withSecret(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize) +{ + return XXH3_64bits_internal(input, length, 0, secret, secretSize, XXH3_hashLong_64b_withSecret); +} + +/*! 
+ @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSeed(XXH_NOESCAPE const void* input, size_t length, XXH64_hash_t seed)
+{
+    return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), XXH3_hashLong_64b_withSeed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH64_hash_t
+XXH3_64bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t length, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (length <= XXH3_MIDSIZE_MAX)
+        return XXH3_64bits_internal(input, length, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_64b_withSecret(input, length, seed, (const xxh_u8*)secret, secretSize);
+}
+
+
+/* === XXH3 streaming === */
+#ifndef XXH_NO_STREAM
+/*
+ * Mallocs a pointer that is always aligned to align.
+ *
+ * This must be freed with `XXH_alignedFree()`.
+ *
+ * malloc typically guarantees 16 byte alignment on 64-bit systems and 8 byte
+ * alignment on 32-bit. This isn't enough for the 32 byte aligned loads in AVX2
+ * or, on 32-bit, for the 16 byte aligned loads in SSE2 and NEON.
+ *
+ * This underalignment previously caused a rather obvious crash which went
+ * completely unnoticed due to XXH3_createState() not actually being tested.
+ * Credit to RedSpah for noticing this bug.
+ *
+ * The alignment is done manually; functions like posix_memalign or _mm_malloc
+ * are avoided. To maintain portability, we would have to write a fallback
+ * like this anyway, and besides, testing for the existence of library
+ * functions without relying on external build tools is impossible.
+ *
+ * The method is simple: Overallocate, manually align, and store the offset
+ * to the original behind the returned pointer.
+ *
+ * Align must be a power of 2 and 8 <= align <= 128.
+ */
+static XXH_MALLOCF void* XXH_alignedMalloc(size_t s, size_t align)
+{
+    XXH_ASSERT(align <= 128 && align >= 8); /* range check */
+    XXH_ASSERT((align & (align-1)) == 0);   /* power of 2 */
+    XXH_ASSERT(s != 0 && s < (s + align));  /* empty/overflow */
+    {   /* Overallocate to make room for manual realignment and an offset byte */
+        xxh_u8* base = (xxh_u8*)XXH_malloc(s + align);
+        if (base != NULL) {
+            /*
+             * Get the offset needed to align this pointer.
+             *
+             * Even if the returned pointer is aligned, there will always be
+             * at least one byte to store the offset to the original pointer.
+             */
+            size_t offset = align - ((size_t)base & (align - 1)); /* base % align */
+            /* Add the offset for the now-aligned pointer */
+            xxh_u8* ptr = base + offset;
+
+            XXH_ASSERT((size_t)ptr % align == 0);
+
+            /* Store the offset immediately before the returned pointer. */
+            ptr[-1] = (xxh_u8)offset;
+            return ptr;
+        }
+        return NULL;
+    }
+}
+/*
+ * Frees an aligned pointer allocated by XXH_alignedMalloc(). Don't pass
+ * normal malloc'd pointers, XXH_alignedMalloc has a specific data layout.
+ */
+static void XXH_alignedFree(void* p)
+{
+    if (p != NULL) {
+        xxh_u8* ptr = (xxh_u8*)p;
+        /* Get the offset byte we added in XXH_alignedMalloc. */
+        xxh_u8 offset = ptr[-1];
+        /* Free the original malloc'd pointer */
+        xxh_u8* base = ptr - offset;
+        XXH_free(base);
+    }
+}
+/*! @ingroup XXH3_family */
+/*!
+ * @brief Allocate an @ref XXH3_state_t.
+ *
+ * Must be freed with XXH3_freeState().
+ * @return An allocated XXH3_state_t on success, `NULL` on failure.
+ */
+XXH_PUBLIC_API XXH3_state_t* XXH3_createState(void)
+{
+    XXH3_state_t* const state = (XXH3_state_t*)XXH_alignedMalloc(sizeof(XXH3_state_t), 64);
+    if (state==NULL) return NULL;
+    XXH3_INITSTATE(state);
+    return state;
+}
+
+/*!
+ @ingroup XXH3_family */
+/*!
+ * @brief Frees an @ref XXH3_state_t.
+ *
+ * The state must have been allocated with XXH3_createState().
+ * @param statePtr A pointer to an @ref XXH3_state_t allocated with @ref XXH3_createState().
+ * @return XXH_OK.
+ */
+XXH_PUBLIC_API XXH_errorcode XXH3_freeState(XXH3_state_t* statePtr)
+{
+    XXH_alignedFree(statePtr);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH3_copyState(XXH_NOESCAPE XXH3_state_t* dst_state, XXH_NOESCAPE const XXH3_state_t* src_state)
+{
+    XXH_memcpy(dst_state, src_state, sizeof(*dst_state));
+}
+
+static void
+XXH3_reset_internal(XXH3_state_t* statePtr,
+                    XXH64_hash_t seed,
+                    const void* secret, size_t secretSize)
+{
+    size_t const initStart = offsetof(XXH3_state_t, bufferedSize);
+    size_t const initLength = offsetof(XXH3_state_t, nbStripesPerBlock) - initStart;
+    XXH_ASSERT(offsetof(XXH3_state_t, nbStripesPerBlock) > initStart);
+    XXH_ASSERT(statePtr != NULL);
+    /* set members from bufferedSize to nbStripesPerBlock (excluded) to 0 */
+    memset((char*)statePtr + initStart, 0, initLength);
+    statePtr->acc[0] = XXH_PRIME32_3;
+    statePtr->acc[1] = XXH_PRIME64_1;
+    statePtr->acc[2] = XXH_PRIME64_2;
+    statePtr->acc[3] = XXH_PRIME64_3;
+    statePtr->acc[4] = XXH_PRIME64_4;
+    statePtr->acc[5] = XXH_PRIME32_2;
+    statePtr->acc[6] = XXH_PRIME64_5;
+    statePtr->acc[7] = XXH_PRIME32_1;
+    statePtr->seed = seed;
+    statePtr->useSeed = (seed != 0);
+    statePtr->extSecret = (const unsigned char*)secret;
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN);
+    statePtr->secretLimit = secretSize - XXH_STRIPE_LEN;
+    statePtr->nbStripesPerBlock = statePtr->secretLimit / XXH_SECRET_CONSUME_RATE;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, XXH3_kSecret, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    /* validate the secret before touching the state */
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, 0, secret, secretSize);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (seed==0) return XXH3_64bits_reset(statePtr);
+    if ((seed != statePtr->seed) || (statePtr->extSecret != NULL))
+        XXH3_initCustomSecret(statePtr->customSecret, seed);
+    XXH3_reset_internal(statePtr, seed, NULL, XXH_SECRET_DEFAULT_SIZE);
+    return XXH_OK;
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_64bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed64)
+{
+    if (statePtr == NULL) return XXH_ERROR;
+    if (secret == NULL) return XXH_ERROR;
+    if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR;
+    XXH3_reset_internal(statePtr, seed64, secret, secretSize);
+    statePtr->useSeed = 1; /* always, even if seed64==0 */
+    return XXH_OK;
+}
+
+/*!
+ * @internal
+ * @brief Processes a large input for XXH3_update() and XXH3_digest_long().
+ *
+ * Unlike XXH3_hashLong_internal_loop(), this can process data that overlaps a block.
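+ *
+ * For example (illustrative numbers): with nbStripesPerBlock == 16 and
+ * *nbStripesSoFarPtr == 12, a call with nbStripes == 10 accumulates 4
+ * stripes to finish the current block, scrambles the accumulators, then
+ * accumulates the remaining 6 stripes into the next block, leaving
+ * *nbStripesSoFarPtr == 6.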
+ *
+ * @param acc               Pointer to the 8 accumulator lanes
+ * @param nbStripesSoFarPtr In/out pointer to the number of leftover stripes in the block
+ * @param nbStripesPerBlock Number of stripes in a block
+ * @param input             Input pointer
+ * @param nbStripes         Number of stripes to process
+ * @param secret            Secret pointer
+ * @param secretLimit       Offset of the last block in @p secret
+ * @param f_acc             Pointer to an XXH3_accumulate implementation
+ * @param f_scramble        Pointer to an XXH3_scrambleAcc implementation
+ * @return                  Pointer past the end of @p input after processing
+ */
+XXH_FORCE_INLINE const xxh_u8 *
+XXH3_consumeStripes(xxh_u64* XXH_RESTRICT acc,
+                    size_t* XXH_RESTRICT nbStripesSoFarPtr, size_t nbStripesPerBlock,
+                    const xxh_u8* XXH_RESTRICT input, size_t nbStripes,
+                    const xxh_u8* XXH_RESTRICT secret, size_t secretLimit,
+                    XXH3_f_accumulate f_acc,
+                    XXH3_f_scrambleAcc f_scramble)
+{
+    const xxh_u8* initialSecret = secret + *nbStripesSoFarPtr * XXH_SECRET_CONSUME_RATE;
+    /* Process full blocks */
+    if (nbStripes >= (nbStripesPerBlock - *nbStripesSoFarPtr)) {
+        /* Process the initial partial block... */
+        size_t nbStripesThisIter = nbStripesPerBlock - *nbStripesSoFarPtr;
+
+        do {
+            /* Accumulate and scramble */
+            f_acc(acc, input, initialSecret, nbStripesThisIter);
+            f_scramble(acc, secret + secretLimit);
+            input += nbStripesThisIter * XXH_STRIPE_LEN;
+            nbStripes -= nbStripesThisIter;
+            /* Then continue the loop with the full block size */
+            nbStripesThisIter = nbStripesPerBlock;
+            initialSecret = secret;
+        } while (nbStripes >= nbStripesPerBlock);
+        *nbStripesSoFarPtr = 0;
+    }
+    /* Process a partial block */
+    if (nbStripes > 0) {
+        f_acc(acc, input, initialSecret, nbStripes);
+        input += nbStripes * XXH_STRIPE_LEN;
+        *nbStripesSoFarPtr += nbStripes;
+    }
+    /* Return end pointer */
+    return input;
+}
+
+#ifndef XXH3_STREAM_USE_STACK
+# if XXH_SIZE_OPT <= 0 && !defined(__clang__) /* clang doesn't need additional stack space */
+#   define XXH3_STREAM_USE_STACK 1
+# endif
+#endif
+/*
+ * Both XXH3_64bits_update and XXH3_128bits_update use this routine.
+ */
+XXH_FORCE_INLINE XXH_errorcode
+XXH3_update(XXH3_state_t* XXH_RESTRICT const state,
+            const xxh_u8* XXH_RESTRICT input, size_t len,
+            XXH3_f_accumulate f_acc,
+            XXH3_f_scrambleAcc f_scramble)
+{
+    if (input==NULL) {
+        XXH_ASSERT(len == 0);
+        return XXH_OK;
+    }
+
+    XXH_ASSERT(state != NULL);
+    {   const xxh_u8* const bEnd = input + len;
+        const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret;
+#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1
+        /* For some reason, gcc and MSVC seem to suffer greatly
+         * when operating on the accumulators directly in the state.
+         * Operating in stack space seems to enable proper optimization.
+ * clang, on the other hand, doesn't seem to need this trick */ + XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[8]; + XXH_memcpy(acc, state->acc, sizeof(acc)); +#else + xxh_u64* XXH_RESTRICT const acc = state->acc; +#endif + state->totalLen += len; + XXH_ASSERT(state->bufferedSize <= XXH3_INTERNALBUFFER_SIZE); + + /* small input : just fill in tmp buffer */ + if (len <= XXH3_INTERNALBUFFER_SIZE - state->bufferedSize) { + XXH_memcpy(state->buffer + state->bufferedSize, input, len); + state->bufferedSize += (XXH32_hash_t)len; + return XXH_OK; + } + + /* total input is now > XXH3_INTERNALBUFFER_SIZE */ + #define XXH3_INTERNALBUFFER_STRIPES (XXH3_INTERNALBUFFER_SIZE / XXH_STRIPE_LEN) + XXH_STATIC_ASSERT(XXH3_INTERNALBUFFER_SIZE % XXH_STRIPE_LEN == 0); /* clean multiple */ + + /* + * Internal buffer is partially filled (always, except at beginning) + * Complete it, then consume it. + */ + if (state->bufferedSize) { + size_t const loadSize = XXH3_INTERNALBUFFER_SIZE - state->bufferedSize; + XXH_memcpy(state->buffer + state->bufferedSize, input, loadSize); + input += loadSize; + XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, XXH3_INTERNALBUFFER_STRIPES, + secret, state->secretLimit, + f_acc, f_scramble); + state->bufferedSize = 0; + } + XXH_ASSERT(input < bEnd); + if (bEnd - input > XXH3_INTERNALBUFFER_SIZE) { + size_t nbStripes = (size_t)(bEnd - 1 - input) / XXH_STRIPE_LEN; + input = XXH3_consumeStripes(acc, + &state->nbStripesSoFar, state->nbStripesPerBlock, + input, nbStripes, + secret, state->secretLimit, + f_acc, f_scramble); + XXH_memcpy(state->buffer + sizeof(state->buffer) - XXH_STRIPE_LEN, input - XXH_STRIPE_LEN, XXH_STRIPE_LEN); + + } + /* Some remaining input (always) : buffer it */ + XXH_ASSERT(input < bEnd); + XXH_ASSERT(bEnd - input <= XXH3_INTERNALBUFFER_SIZE); + XXH_ASSERT(state->bufferedSize == 0); + XXH_memcpy(state->buffer, input, (size_t)(bEnd-input)); + state->bufferedSize = (XXH32_hash_t)(bEnd-input); +#if defined(XXH3_STREAM_USE_STACK) && XXH3_STREAM_USE_STACK >= 1 + /* save stack accumulators into state */ + XXH_memcpy(state->acc, acc, sizeof(acc)); +#endif + } + + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_64bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_update(state, (const xxh_u8*)input, len, + XXH3_accumulate, XXH3_scrambleAcc); +} + + +XXH_FORCE_INLINE void +XXH3_digest_long (XXH64_hash_t* acc, + const XXH3_state_t* state, + const unsigned char* secret) +{ + xxh_u8 lastStripe[XXH_STRIPE_LEN]; + const xxh_u8* lastStripePtr; + + /* + * Digest on a local copy. This way, the state remains unaltered, and it can + * continue ingesting more input afterwards. 
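+ *
+ * Note (a summary of the two branches below): if at least one full stripe
+ * is buffered, the remaining whole stripes are consumed directly from the
+ * buffer; otherwise the final stripe is rebuilt in lastStripe[] from the
+ * tail of the previous buffer contents plus what is currently buffered.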
+ */ + XXH_memcpy(acc, state->acc, sizeof(state->acc)); + if (state->bufferedSize >= XXH_STRIPE_LEN) { + /* Consume remaining stripes then point to remaining data in buffer */ + size_t const nbStripes = (state->bufferedSize - 1) / XXH_STRIPE_LEN; + size_t nbStripesSoFar = state->nbStripesSoFar; + XXH3_consumeStripes(acc, + &nbStripesSoFar, state->nbStripesPerBlock, + state->buffer, nbStripes, + secret, state->secretLimit, + XXH3_accumulate, XXH3_scrambleAcc); + lastStripePtr = state->buffer + state->bufferedSize - XXH_STRIPE_LEN; + } else { /* bufferedSize < XXH_STRIPE_LEN */ + /* Copy to temp buffer */ + size_t const catchupSize = XXH_STRIPE_LEN - state->bufferedSize; + XXH_ASSERT(state->bufferedSize > 0); /* there is always some input buffered */ + XXH_memcpy(lastStripe, state->buffer + sizeof(state->buffer) - catchupSize, catchupSize); + XXH_memcpy(lastStripe + catchupSize, state->buffer, state->bufferedSize); + lastStripePtr = lastStripe; + } + /* Last stripe */ + XXH3_accumulate_512(acc, + lastStripePtr, + secret + state->secretLimit - XXH_SECRET_LASTACC_START); +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH64_hash_t XXH3_64bits_digest (XXH_NOESCAPE const XXH3_state_t* state) +{ + const unsigned char* const secret = (state->extSecret == NULL) ? state->customSecret : state->extSecret; + if (state->totalLen > XXH3_MIDSIZE_MAX) { + XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB]; + XXH3_digest_long(acc, state, secret); + return XXH3_mergeAccs(acc, + secret + XXH_SECRET_MERGEACCS_START, + (xxh_u64)state->totalLen * XXH_PRIME64_1); + } + /* totalLen <= XXH3_MIDSIZE_MAX: digesting a short input */ + if (state->useSeed) + return XXH3_64bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed); + return XXH3_64bits_withSecret(state->buffer, (size_t)(state->totalLen), + secret, state->secretLimit + XXH_STRIPE_LEN); +} +#endif /* !XXH_NO_STREAM */ + + +/* ========================================== + * XXH3 128 bits (a.k.a XXH128) + * ========================================== + * XXH3's 128-bit variant has better mixing and strength than the 64-bit variant, + * even without counting the significantly larger output size. + * + * For example, extra steps are taken to avoid the seed-dependent collisions + * in 17-240 byte inputs (See XXH3_mix16B and XXH128_mix32B). + * + * This strength naturally comes at the cost of some speed, especially on short + * lengths. Note that longer hashes are about as fast as the 64-bit version + * due to it using only a slight modification of the 64-bit loop. + * + * XXH128 is also more oriented towards 64-bit machines. It is still extremely + * fast for a _128-bit_ hash on 32-bit (it usually clears XXH64). + */ + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_1to3_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + /* A doubled version of 1to3_64b with different constants. 
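+     * The high half is derived from the same combined word without
+     * re-reading the input (see combinedh below):
+     *
+     *   combinedh = rotl32(byteswap32(combinedl), 13)
+     *
+     * so the two 64-bit halves mix the same bytes differently.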
+     */
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(1 <= len && len <= 3);
+    XXH_ASSERT(secret != NULL);
+    /*
+     * len = 1: combinedl = { input[0], 0x01, input[0], input[0] }
+     * len = 2: combinedl = { input[1], 0x02, input[0], input[1] }
+     * len = 3: combinedl = { input[2], 0x03, input[0], input[1] }
+     */
+    {   xxh_u8 const c1 = input[0];
+        xxh_u8 const c2 = input[len >> 1];
+        xxh_u8 const c3 = input[len - 1];
+        xxh_u32 const combinedl = ((xxh_u32)c1 <<16) | ((xxh_u32)c2 << 24)
+                                | ((xxh_u32)c3 << 0) | ((xxh_u32)len << 8);
+        xxh_u32 const combinedh = XXH_rotl32(XXH_swap32(combinedl), 13);
+        xxh_u64 const bitflipl = (XXH_readLE32(secret) ^ XXH_readLE32(secret+4)) + seed;
+        xxh_u64 const bitfliph = (XXH_readLE32(secret+8) ^ XXH_readLE32(secret+12)) - seed;
+        xxh_u64 const keyed_lo = (xxh_u64)combinedl ^ bitflipl;
+        xxh_u64 const keyed_hi = (xxh_u64)combinedh ^ bitfliph;
+        XXH128_hash_t h128;
+        h128.low64  = XXH64_avalanche(keyed_lo);
+        h128.high64 = XXH64_avalanche(keyed_hi);
+        return h128;
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_4to8_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(4 <= len && len <= 8);
+    seed ^= (xxh_u64)XXH_swap32((xxh_u32)seed) << 32;
+    {   xxh_u32 const input_lo = XXH_readLE32(input);
+        xxh_u32 const input_hi = XXH_readLE32(input + len - 4);
+        xxh_u64 const input_64 = input_lo + ((xxh_u64)input_hi << 32);
+        xxh_u64 const bitflip = (XXH_readLE64(secret+16) ^ XXH_readLE64(secret+24)) + seed;
+        xxh_u64 const keyed = input_64 ^ bitflip;
+
+        /* Shift len left so it is even; added to the odd XXH_PRIME64_1, this
+         * keeps the multiplier odd, avoiding even multipliers which would
+         * shed low-order bits. */
+        XXH128_hash_t m128 = XXH_mult64to128(keyed, XXH_PRIME64_1 + (len << 2));
+
+        m128.high64 += (m128.low64 << 1);
+        m128.low64  ^= (m128.high64 >> 3);
+
+        m128.low64   = XXH_xorshift64(m128.low64, 35);
+        m128.low64  *= PRIME_MX2;
+        m128.low64   = XXH_xorshift64(m128.low64, 28);
+        m128.high64  = XXH3_avalanche(m128.high64);
+        return m128;
+    }
+}
+
+XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_9to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed)
+{
+    XXH_ASSERT(input != NULL);
+    XXH_ASSERT(secret != NULL);
+    XXH_ASSERT(9 <= len && len <= 16);
+    {   xxh_u64 const bitflipl = (XXH_readLE64(secret+32) ^ XXH_readLE64(secret+40)) - seed;
+        xxh_u64 const bitfliph = (XXH_readLE64(secret+48) ^ XXH_readLE64(secret+56)) + seed;
+        xxh_u64 const input_lo = XXH_readLE64(input);
+        xxh_u64       input_hi = XXH_readLE64(input + len - 8);
+        XXH128_hash_t m128 = XXH_mult64to128(input_lo ^ input_hi ^ bitflipl, XXH_PRIME64_1);
+        /*
+         * Put len in the middle of m128 to ensure that the length gets mixed to
+         * both the low and high bits in the 128x64 multiply below.
+         */
+        m128.low64 += (xxh_u64)(len - 1) << 54;
+        input_hi   ^= bitfliph;
+        /*
+         * Add the high 32 bits of input_hi to the high 32 bits of m128, then
+         * add the long product of the low 32 bits of input_hi and XXH_PRIME32_2 to
+         * the high 64 bits of m128.
+         *
+         * The best approach to this operation is different on 32-bit and 64-bit.
+         */
+        if (sizeof(void *) < sizeof(xxh_u64)) { /* 32-bit */
+            /*
+             * 32-bit optimized version, which is more readable.
+             *
+             * On 32-bit, it removes an ADC and delays a dependency between the two
+             * halves of m128.high64, but it generates an extra mask on 64-bit.
+             */
+            m128.high64 += (input_hi & 0xFFFFFFFF00000000ULL) + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2);
+        } else {
+            /*
+             * 64-bit optimized (albeit more confusing) version.
+ * + * Uses some properties of addition and multiplication to remove the mask: + * + * Let: + * a = input_hi.lo = (input_hi & 0x00000000FFFFFFFF) + * b = input_hi.hi = (input_hi & 0xFFFFFFFF00000000) + * c = XXH_PRIME32_2 + * + * a + (b * c) + * Inverse Property: x + y - x == y + * a + (b * (1 + c - 1)) + * Distributive Property: x * (y + z) == (x * y) + (x * z) + * a + (b * 1) + (b * (c - 1)) + * Identity Property: x * 1 == x + * a + b + (b * (c - 1)) + * + * Substitute a, b, and c: + * input_hi.hi + input_hi.lo + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + * + * Since input_hi.hi + input_hi.lo == input_hi, we get this: + * input_hi + ((xxh_u64)input_hi.lo * (XXH_PRIME32_2 - 1)) + */ + m128.high64 += input_hi + XXH_mult32to64((xxh_u32)input_hi, XXH_PRIME32_2 - 1); + } + /* m128 ^= XXH_swap64(m128 >> 64); */ + m128.low64 ^= XXH_swap64(m128.high64); + + { /* 128x64 multiply: h128 = m128 * XXH_PRIME64_2; */ + XXH128_hash_t h128 = XXH_mult64to128(m128.low64, XXH_PRIME64_2); + h128.high64 += m128.high64 * XXH_PRIME64_2; + + h128.low64 = XXH3_avalanche(h128.low64); + h128.high64 = XXH3_avalanche(h128.high64); + return h128; + } } +} + +/* + * Assumption: `secret` size is >= XXH3_SECRET_SIZE_MIN + */ +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_0to16_128b(const xxh_u8* input, size_t len, const xxh_u8* secret, XXH64_hash_t seed) +{ + XXH_ASSERT(len <= 16); + { if (len > 8) return XXH3_len_9to16_128b(input, len, secret, seed); + if (len >= 4) return XXH3_len_4to8_128b(input, len, secret, seed); + if (len) return XXH3_len_1to3_128b(input, len, secret, seed); + { XXH128_hash_t h128; + xxh_u64 const bitflipl = XXH_readLE64(secret+64) ^ XXH_readLE64(secret+72); + xxh_u64 const bitfliph = XXH_readLE64(secret+80) ^ XXH_readLE64(secret+88); + h128.low64 = XXH64_avalanche(seed ^ bitflipl); + h128.high64 = XXH64_avalanche( seed ^ bitfliph); + return h128; + } } +} + +/* + * A bit slower than XXH3_mix16B, but handles multiply by zero better. + */ +XXH_FORCE_INLINE XXH128_hash_t +XXH128_mix32B(XXH128_hash_t acc, const xxh_u8* input_1, const xxh_u8* input_2, + const xxh_u8* secret, XXH64_hash_t seed) +{ + acc.low64 += XXH3_mix16B (input_1, secret+0, seed); + acc.low64 ^= XXH_readLE64(input_2) + XXH_readLE64(input_2 + 8); + acc.high64 += XXH3_mix16B (input_2, secret+16, seed); + acc.high64 ^= XXH_readLE64(input_1) + XXH_readLE64(input_1 + 8); + return acc; +} + + +XXH_FORCE_INLINE XXH_PUREF XXH128_hash_t +XXH3_len_17to128_128b(const xxh_u8* XXH_RESTRICT input, size_t len, + const xxh_u8* XXH_RESTRICT secret, size_t secretSize, + XXH64_hash_t seed) +{ + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize; + XXH_ASSERT(16 < len && len <= 128); + + { XXH128_hash_t acc; + acc.low64 = len * XXH_PRIME64_1; + acc.high64 = 0; + +#if XXH_SIZE_OPT >= 1 + { + /* Smaller, but slightly slower. 
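+             * The rolled loop below is equivalent to the unrolled #else
+             * branch: e.g. for len == 100 (an illustrative value), i starts
+             * at 3 and the iterations mix the chunk pairs
+             * (input+48, input+36), (input+32, input+52),
+             * (input+16, input+68) and (input+0, input+84)
+             * against successive 32-byte secret segments.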
+             */
+            unsigned int i = (unsigned int)(len - 1) / 32;
+            do {
+                acc = XXH128_mix32B(acc, input+16*i, input+len-16*(i+1), secret+32*i, seed);
+            } while (i-- != 0);
+        }
+#else
+        if (len > 32) {
+            if (len > 64) {
+                if (len > 96) {
+                    acc = XXH128_mix32B(acc, input+48, input+len-64, secret+96, seed);
+                }
+                acc = XXH128_mix32B(acc, input+32, input+len-48, secret+64, seed);
+            }
+            acc = XXH128_mix32B(acc, input+16, input+len-32, secret+32, seed);
+        }
+        acc = XXH128_mix32B(acc, input, input+len-16, secret, seed);
+#endif
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+XXH_NO_INLINE XXH_PUREF XXH128_hash_t
+XXH3_len_129to240_128b(const xxh_u8* XXH_RESTRICT input, size_t len,
+                       const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                       XXH64_hash_t seed)
+{
+    XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
+    XXH_ASSERT(128 < len && len <= XXH3_MIDSIZE_MAX);
+
+    {   XXH128_hash_t acc;
+        unsigned i;
+        acc.low64 = len * XXH_PRIME64_1;
+        acc.high64 = 0;
+        /*
+         * We set `i` to offset + 32. We do this so that unchanged
+         * `len` can be used as upper bound. This reaches a sweet spot
+         * where both x86 and aarch64 get simple address generation (agen)
+         * and good codegen for the loop.
+         */
+        for (i = 32; i < 160; i += 32) {
+            acc = XXH128_mix32B(acc,
+                                input  + i - 32,
+                                input  + i - 16,
+                                secret + i - 32,
+                                seed);
+        }
+        acc.low64 = XXH3_avalanche(acc.low64);
+        acc.high64 = XXH3_avalanche(acc.high64);
+        /*
+         * NB: `i <= len` will duplicate the last 32 bytes if
+         * len % 32 was zero. This is an unfortunate necessity to keep
+         * the hash result stable.
+         */
+        for (i=160; i <= len; i += 32) {
+            acc = XXH128_mix32B(acc,
+                                input + i - 32,
+                                input + i - 16,
+                                secret + XXH3_MIDSIZE_STARTOFFSET + i - 160,
+                                seed);
+        }
+        /* last bytes */
+        acc = XXH128_mix32B(acc,
+                            input + len - 16,
+                            input + len - 32,
+                            secret + XXH3_SECRET_SIZE_MIN - XXH3_MIDSIZE_LASTOFFSET - 16,
+                            (XXH64_hash_t)0 - seed);
+
+        {   XXH128_hash_t h128;
+            h128.low64  = acc.low64 + acc.high64;
+            h128.high64 = (acc.low64    * XXH_PRIME64_1)
+                        + (acc.high64   * XXH_PRIME64_4)
+                        + ((len - seed) * XXH_PRIME64_2);
+            h128.low64  = XXH3_avalanche(h128.low64);
+            h128.high64 = (XXH64_hash_t)0 - XXH3_avalanche(h128.high64);
+            return h128;
+        }
+    }
+}
+
+XXH_FORCE_INLINE XXH128_hash_t
+XXH3_hashLong_128b_internal(const void* XXH_RESTRICT input, size_t len,
+                            const xxh_u8* XXH_RESTRICT secret, size_t secretSize,
+                            XXH3_f_accumulate f_acc,
+                            XXH3_f_scrambleAcc f_scramble)
+{
+    XXH_ALIGN(XXH_ACC_ALIGN) xxh_u64 acc[XXH_ACC_NB] = XXH3_INIT_ACC;
+
+    XXH3_hashLong_internal_loop(acc, (const xxh_u8*)input, len, secret, secretSize, f_acc, f_scramble);
+
+    /* converge into final hash */
+    XXH_STATIC_ASSERT(sizeof(acc) == 64);
+    XXH_ASSERT(secretSize >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+    {   XXH128_hash_t h128;
+        h128.low64  = XXH3_mergeAccs(acc,
+                                     secret + XXH_SECRET_MERGEACCS_START,
+                                     (xxh_u64)len * XXH_PRIME64_1);
+        h128.high64 = XXH3_mergeAccs(acc,
+                                     secret + secretSize
+                                            - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                     ~((xxh_u64)len * XXH_PRIME64_2));
+        return h128;
+    }
+}
+
+/*
+ * It's important for performance that XXH3_hashLong() is not inlined.
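+ * (Presumably for the same reason as the 64-bit variant; see the note above
+ * XXH3_hashLong_64b_default. A rough sketch of the default long-hash path:
+ *
+ *   XXH3_128bits()                      force-inlined
+ *    -> XXH3_128bits_internal()         force-inlined
+ *     -> XXH3_hashLong_128b_default()   the no-inline barrier
+ *      -> XXH3_hashLong_128b_internal() force-inlined into the above
+ * )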
+ */ +XXH_NO_INLINE XXH_PUREF XXH128_hash_t +XXH3_hashLong_128b_default(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; (void)secret; (void)secretLen; + return XXH3_hashLong_128b_internal(input, len, XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_accumulate, XXH3_scrambleAcc); +} + +/* + * It's important for performance to pass @p secretLen (when it's static) + * to the compiler, so that it can properly optimize the vectorized loop. + * + * When the secret size is unknown, or on GCC 12 where the mix of NO_INLINE and FORCE_INLINE + * breaks -Og, this is XXH_NO_INLINE. + */ +XXH3_WITH_SECRET_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSecret(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)seed64; + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, secretLen, + XXH3_accumulate, XXH3_scrambleAcc); +} + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed_internal(const void* XXH_RESTRICT input, size_t len, + XXH64_hash_t seed64, + XXH3_f_accumulate f_acc, + XXH3_f_scrambleAcc f_scramble, + XXH3_f_initCustomSecret f_initSec) +{ + if (seed64 == 0) + return XXH3_hashLong_128b_internal(input, len, + XXH3_kSecret, sizeof(XXH3_kSecret), + f_acc, f_scramble); + { XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + f_initSec(secret, seed64); + return XXH3_hashLong_128b_internal(input, len, (const xxh_u8*)secret, sizeof(secret), + f_acc, f_scramble); + } +} + +/* + * It's important for performance that XXH3_hashLong is not inlined. + */ +XXH_NO_INLINE XXH128_hash_t +XXH3_hashLong_128b_withSeed(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen) +{ + (void)secret; (void)secretLen; + return XXH3_hashLong_128b_withSeed_internal(input, len, seed64, + XXH3_accumulate, XXH3_scrambleAcc, XXH3_initCustomSecret); +} + +typedef XXH128_hash_t (*XXH3_hashLong128_f)(const void* XXH_RESTRICT, size_t, + XXH64_hash_t, const void* XXH_RESTRICT, size_t); + +XXH_FORCE_INLINE XXH128_hash_t +XXH3_128bits_internal(const void* input, size_t len, + XXH64_hash_t seed64, const void* XXH_RESTRICT secret, size_t secretLen, + XXH3_hashLong128_f f_hl128) +{ + XXH_ASSERT(secretLen >= XXH3_SECRET_SIZE_MIN); + /* + * If an action is to be taken if `secret` conditions are not respected, + * it should be done here. + * For now, it's a contract pre-condition. + * Adding a check and a branch here would cost performance at every hash. + */ + if (len <= 16) + return XXH3_len_0to16_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, seed64); + if (len <= 128) + return XXH3_len_17to128_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + if (len <= XXH3_MIDSIZE_MAX) + return XXH3_len_129to240_128b((const xxh_u8*)input, len, (const xxh_u8*)secret, secretLen, seed64); + return f_hl128(input, len, seed64, secret, secretLen); +} + + +/* === Public XXH128 API === */ + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API XXH128_hash_t XXH3_128bits(XXH_NOESCAPE const void* input, size_t len) +{ + return XXH3_128bits_internal(input, len, 0, + XXH3_kSecret, sizeof(XXH3_kSecret), + XXH3_hashLong_128b_default); +} + +/*! 
+ @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecret(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_128bits_internal(input, len, 0,
+                                 (const xxh_u8*)secret, secretSize,
+                                 XXH3_hashLong_128b_withSecret);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSeed(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_internal(input, len, seed,
+                                 XXH3_kSecret, sizeof(XXH3_kSecret),
+                                 XXH3_hashLong_128b_withSeed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH3_128bits_withSecretandSeed(XXH_NOESCAPE const void* input, size_t len, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    if (len <= XXH3_MIDSIZE_MAX)
+        return XXH3_128bits_internal(input, len, seed, XXH3_kSecret, sizeof(XXH3_kSecret), NULL);
+    return XXH3_hashLong_128b_withSecret(input, len, seed, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128(XXH_NOESCAPE const void* input, size_t len, XXH64_hash_t seed)
+{
+    return XXH3_128bits_withSeed(input, len, seed);
+}
+
+
+/* === XXH3 128-bit streaming === */
+#ifndef XXH_NO_STREAM
+/*
+ * All initialization and update functions are identical to the 64-bit streaming variant.
+ * The only difference is the finalization routine.
+ */
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset(XXH_NOESCAPE XXH3_state_t* statePtr)
+{
+    return XXH3_64bits_reset(statePtr);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecret(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize)
+{
+    return XXH3_64bits_reset_withSecret(statePtr, secret, secretSize);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH64_hash_t seed)
+{
+    return XXH3_64bits_reset_withSeed(statePtr, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_reset_withSecretandSeed(XXH_NOESCAPE XXH3_state_t* statePtr, XXH_NOESCAPE const void* secret, size_t secretSize, XXH64_hash_t seed)
+{
+    return XXH3_64bits_reset_withSecretandSeed(statePtr, secret, secretSize, seed);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH_errorcode
+XXH3_128bits_update(XXH_NOESCAPE XXH3_state_t* state, XXH_NOESCAPE const void* input, size_t len)
+{
+    return XXH3_64bits_update(state, input, len);
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t XXH3_128bits_digest (XXH_NOESCAPE const XXH3_state_t* state)
+{
+    const unsigned char* const secret = (state->extSecret == NULL) ?
+                                        state->customSecret : state->extSecret;
+    if (state->totalLen > XXH3_MIDSIZE_MAX) {
+        XXH_ALIGN(XXH_ACC_ALIGN) XXH64_hash_t acc[XXH_ACC_NB];
+        XXH3_digest_long(acc, state, secret);
+        XXH_ASSERT(state->secretLimit + XXH_STRIPE_LEN >= sizeof(acc) + XXH_SECRET_MERGEACCS_START);
+        {   XXH128_hash_t h128;
+            h128.low64  = XXH3_mergeAccs(acc,
+                                         secret + XXH_SECRET_MERGEACCS_START,
+                                         (xxh_u64)state->totalLen * XXH_PRIME64_1);
+            h128.high64 = XXH3_mergeAccs(acc,
+                                         secret + state->secretLimit + XXH_STRIPE_LEN
+                                                - sizeof(acc) - XXH_SECRET_MERGEACCS_START,
+                                         ~((xxh_u64)state->totalLen * XXH_PRIME64_2));
+            return h128;
+        }
+    }
+    /* len <= XXH3_MIDSIZE_MAX : short code */
+    if (state->seed)
+        return XXH3_128bits_withSeed(state->buffer, (size_t)state->totalLen, state->seed);
+    return XXH3_128bits_withSecret(state->buffer, (size_t)(state->totalLen),
+                                   secret, state->secretLimit + XXH_STRIPE_LEN);
+}
+#endif /* !XXH_NO_STREAM */
+/* 128-bit utility functions */
+
+#include <string.h>   /* memcmp, memcpy */
+
+/* return : 1 if equal, 0 if different */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_isEqual(XXH128_hash_t h1, XXH128_hash_t h2)
+{
+    /* note : XXH128_hash_t is compact, it has no padding byte */
+    return !(memcmp(&h1, &h2, sizeof(h1)));
+}
+
+/* This prototype is compatible with stdlib's qsort().
+ * @return : >0 if *h128_1 > *h128_2
+ *           <0 if *h128_1 < *h128_2
+ *           =0 if *h128_1 == *h128_2 */
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API int XXH128_cmp(XXH_NOESCAPE const void* h128_1, XXH_NOESCAPE const void* h128_2)
+{
+    XXH128_hash_t const h1 = *(const XXH128_hash_t*)h128_1;
+    XXH128_hash_t const h2 = *(const XXH128_hash_t*)h128_2;
+    int const hcmp = (h1.high64 > h2.high64) - (h2.high64 > h1.high64);
+    /* note : bets that, in most cases, hash values are different */
+    if (hcmp) return hcmp;
+    return (h1.low64 > h2.low64) - (h2.low64 > h1.low64);
+}
+
+
+/*====== Canonical representation ======*/
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API void
+XXH128_canonicalFromHash(XXH_NOESCAPE XXH128_canonical_t* dst, XXH128_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH128_canonical_t) == sizeof(XXH128_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) {
+        hash.high64 = XXH_swap64(hash.high64);
+        hash.low64  = XXH_swap64(hash.low64);
+    }
+    XXH_memcpy(dst, &hash.high64, sizeof(hash.high64));
+    XXH_memcpy((char*)dst + sizeof(hash.high64), &hash.low64, sizeof(hash.low64));
+}
+
+/*! @ingroup XXH3_family */
+XXH_PUBLIC_API XXH128_hash_t
+XXH128_hashFromCanonical(XXH_NOESCAPE const XXH128_canonical_t* src)
+{
+    XXH128_hash_t h;
+    h.high64 = XXH_readBE64(src);
+    h.low64  = XXH_readBE64(src->digest + 8);
+    return h;
+}
+
+
+
+/* ==========================================
+ * Secret generators
+ * ==========================================
+ */
+#define XXH_MIN(x, y) (((x) > (y)) ? (y) : (x))
+
+XXH_FORCE_INLINE void XXH3_combine16(void* dst, XXH128_hash_t h128)
+{
+    XXH_writeLE64( dst, XXH_readLE64(dst) ^ h128.low64 );
+    XXH_writeLE64( (char*)dst+8, XXH_readLE64((char*)dst+8) ^ h128.high64 );
+}
+
+/*!
@ingroup XXH3_family */ +XXH_PUBLIC_API XXH_errorcode +XXH3_generateSecret(XXH_NOESCAPE void* secretBuffer, size_t secretSize, XXH_NOESCAPE const void* customSeed, size_t customSeedSize) +{ +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(secretBuffer != NULL); + XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); +#else + /* production mode, assert() are disabled */ + if (secretBuffer == NULL) return XXH_ERROR; + if (secretSize < XXH3_SECRET_SIZE_MIN) return XXH_ERROR; +#endif + + if (customSeedSize == 0) { + customSeed = XXH3_kSecret; + customSeedSize = XXH_SECRET_DEFAULT_SIZE; + } +#if (XXH_DEBUGLEVEL >= 1) + XXH_ASSERT(customSeed != NULL); +#else + if (customSeed == NULL) return XXH_ERROR; +#endif + + /* Fill secretBuffer with a copy of customSeed - repeat as needed */ + { size_t pos = 0; + while (pos < secretSize) { + size_t const toCopy = XXH_MIN((secretSize - pos), customSeedSize); + memcpy((char*)secretBuffer + pos, customSeed, toCopy); + pos += toCopy; + } } + + { size_t const nbSeg16 = secretSize / 16; + size_t n; + XXH128_canonical_t scrambler; + XXH128_canonicalFromHash(&scrambler, XXH128(customSeed, customSeedSize, 0)); + for (n=0; n<nbSeg16; n++) { + XXH128_hash_t const h128 = XXH128(&scrambler, sizeof(scrambler), n); + XXH3_combine16((char*)secretBuffer + n*16, h128); + } + /* last segment */ + XXH3_combine16((char*)secretBuffer + secretSize - 16, XXH128_hashFromCanonical(&scrambler)); + } + return XXH_OK; +} + +/*! @ingroup XXH3_family */ +XXH_PUBLIC_API void +XXH3_generateSecret_fromSeed(XXH_NOESCAPE void* secretBuffer, XXH64_hash_t seed) +{ + XXH_ALIGN(XXH_SEC_ALIGN) xxh_u8 secret[XXH_SECRET_DEFAULT_SIZE]; + XXH3_initCustomSecret(secret, seed); + XXH_ASSERT(secretBuffer != NULL); + memcpy(secretBuffer, secret, XXH_SECRET_DEFAULT_SIZE); +} + + + +/* Pop our optimization override from above */ +#if XXH_VECTOR == XXH_AVX2 /* AVX2 */ \ + && defined(__GNUC__) && !defined(__clang__) /* GCC, not Clang */ \ + && defined(__OPTIMIZE__) && XXH_SIZE_OPT <= 0 /* respect -O0 and -Os */ +# pragma GCC pop_options +#endif + +#endif /* XXH_NO_LONG_LONG */ + +#endif /* XXH_NO_XXH3 */ + +/*! + * @} + */ +#endif /* XXH_IMPLEMENTATION */ + + +#if defined (__cplusplus) +} /* extern "C" */ +#endif diff --git a/include/make/compiler.mk b/include/make/compiler.mk new file mode 100644 index 0000000..f251693 --- /dev/null +++ b/include/make/compiler.mk @@ -0,0 +1,42 @@ +# WARNING: Do not change cc-opt, cc-opt-alt or cc-warning without checking if +# clang bug #49364 is fixed. stderr is redirected to /dev/null on +# purpose, to work around a clang 11 bug that crashes if stderr is +# redirected to stdin. +# +# Function used to detect support of a given option by the compiler. +# Usage: CFLAGS += $(call cc-opt,option). Eg: $(call cc-opt,-fwrapv) +# Note: ensure the referencing variable is assigned using ":=" and not "=" to +# call it only once. +cc-opt = $(shell set -e; if $(CC) -Werror $(1) -E -xc - -o /dev/null </dev/null >&0 2>/dev/null; then echo "$(1)"; fi;) + +# same but tries with $2 if $1 is not supported +cc-opt-alt = $(if $(shell set -e; if $(CC) -Werror $(1) -E -xc - -o /dev/null </dev/null >&0 2>/dev/null; then echo 1;fi),$(1),$(call cc-opt,$(2))) + +# validate a list of options one at a time +cc-all-opts = $(foreach a,$(1),$(call cc-opt,$(a))) + +# try to pass plenty of options at once, take them on success or try them +# one at a time on failure and keep successful ones. This is handy to quickly +# validate most common options. 
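+# Hypothetical example (variable name invented for illustration):
+#   WARN_CFLAGS := $(call cc-all-fast,-Wall -Wextra -Wundef)
+# keeps all three flags if the compiler accepts them together, otherwise each
+# one is retested individually and only the supported ones are kept.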
+cc-all-fast = $(if $(call cc-opt,$(1)),$(1),$(call cc-all-opts,$(1)))
+
+# Below we verify that the compiler supports any -Wno-something option to
+# disable any warning, or if a special option is needed to achieve that. This
+# will allow us to get rid of testing when the compiler doesn't care. The
+# result is made of two variables:
+#  - cc-anywno that's non-empty if the compiler supports disabling anything
+#  - cc-wnouwo that may contain an option needed to enable this behavior
+# Gcc 4.x and above do not need any option but will still complain about unknown
+# options if another warning or error happens, and as such they're not testable.
+# Clang needs a special option -Wno-unknown-warning-option. Compilers not
+# supporting this option will check all warnings individually.
cc-anywno := $(call cc-opt,-Wno-haproxy-warning)
+cc-wnouwo := $(if $(cc-anywno),,$(call cc-opt,-Wno-unknown-warning-option))
+cc-anywno := $(if $(cc-anywno)$(cc-wnouwo),1)
+
+# Disable a warning when supported by the compiler. Don't put spaces around the
+# warning! And don't use cc-opt which doesn't always report an error until
+# another one is also returned. If "cc-anywno" is set, the compiler supports
+# -Wno- followed by anything so we don't even need to start the compiler.
+# Usage: CFLAGS += $(call cc-nowarn,warning). Eg: $(call cc-nowarn,format-truncation)
+cc-nowarn = $(if $(cc-anywno),-Wno-$(1),$(shell set -e; if $(CC) -Werror -W$(1) -E -xc - -o /dev/null </dev/null >&0 2>/dev/null; then echo "-Wno-$(1)"; fi;))
diff --git a/include/make/options.mk b/include/make/options.mk
new file mode 100644
index 0000000..022981c
--- /dev/null
+++ b/include/make/options.mk
@@ -0,0 +1,54 @@
+# this contains various functions and macros used to manipulate USE_* options
+# and their flags
+
+# Depending on the target platform, some options are set, as well as some
+# CFLAGS and LDFLAGS. All variables pre-set here will not appear in the build
+# options string. They may be set to any value, but are historically set to
+# "implicit" which eases debugging. You should not have to change anything
+# there unless you're adding support for a new platform.
+default_opts = $(foreach name,$(1),$(eval $(name)=implicit))
+
+# Return USE_xxx=$(USE_xxx) if the variable was set from the environment or the
+# command line.
+ignore_implicit = $(if $(subst environment,,$(origin $(1))),        \
+                       $(if $(subst command line,,$(origin $(1))),, \
+                            $(1)=$($(1))),                          \
+                       $(1)=$($(1)))
+
+# This macro collects all USE_* values except those set to "implicit". This
+# is used to report a list of all flags which were used to build this version.
+# Do not assign anything to it.
+build_options = $(foreach opt,$(use_opts),$(call ignore_implicit,$(opt)))
+
+# Make a list of all known features with +/- prepended depending on their
+# activation status. Must be a macro so that dynamically enabled ones are
+# evaluated with their current status.
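+# Eg (illustrative): with USE_EPOLL=1 set and USE_PCRE left unset, the
+# resulting list would contain "+EPOLL" and "-PCRE" among other features.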
+build_features = $(foreach opt,$(patsubst USE_%,%,$(sort $(use_opts))),$(if $(USE_$(opt)),+$(opt),-$(opt)))
+
+# This returns a list of -DUSE_* for all known USE_* that are set
+opts_as_defines = $(foreach opt,$(use_opts),$(if $($(opt)),-D$(opt),))
+
+# Lists all enabled or disabled options without the "USE_" prefix
+enabled_opts = $(foreach opt,$(patsubst USE_%,%,$(use_opts)),$(if $(USE_$(opt)),$(opt),))
+disabled_opts = $(foreach opt,$(patsubst USE_%,%,$(use_opts)),$(if $(USE_$(opt)),,$(opt)))
+
+# preset all XXX_{INC,LIB,CFLAGS,LDFLAGS,SRC} variables to empty for $1=XXX
+reset_opt_vars = $(foreach name,INC LIB CFLAGS LDFLAGS SRC,$(eval $(1)_$(name)=))
+
+# preset all variables for all supported build options among use_opts
+reset_opts_vars = $(foreach opt,$(patsubst USE_%,%,$(use_opts)),$(call reset_opt_vars,$(opt)))
+
+# append $(1)_{C,LD}FLAGS into OPTIONS_{C,LD}FLAGS if not empty
+define collect_opt_flags
+  ifneq ($$($(1)_CFLAGS),)
+    OPTIONS_CFLAGS  += $$($(1)_CFLAGS)
+  endif
+  ifneq ($$($(1)_LDFLAGS),)
+    OPTIONS_LDFLAGS += $$($(1)_LDFLAGS)
+  endif
+endef
+
+# collect all enabled USE_foo's foo_{C,LD}FLAGS into OPTIONS_{C,LD}FLAGS
+collect_opts_flags = $(foreach opt,$(enabled_opts),$(eval $(call collect_opt_flags,$(opt))))
diff --git a/include/make/verbose.mk b/include/make/verbose.mk
new file mode 100644
index 0000000..c37d513
--- /dev/null
+++ b/include/make/verbose.mk
@@ -0,0 +1,36 @@
+# verbosity: pass V=1 for verbose shell invocation
+V = 0
+Q = @
+ifeq ($V,1)
+Q=
+endif
+
+# Some common commands such as CC/LD/AR are redefined with a cmd_ equivalent
+# and are either mapped to a silent rule just indicating what is being done,
+# or to themselves depending on the verbosity level.
+ifeq ($V,1)
+cmd_CC = $(CC)
+cmd_LD = $(LD)
+cmd_AR = $(AR)
+cmd_MAKE = +$(MAKE)
+else
+ifeq (3.81,$(firstword $(sort $(MAKE_VERSION) 3.81)))
+# 3.81 or above
+cmd_CC = $(info $   CC      $@) $(Q)$(CC)
+cmd_LD = $(info $   LD      $@) $(Q)$(LD)
+cmd_AR = $(info $   AR      $@) $(Q)$(AR)
+cmd_MAKE = $(info $   MAKE    $@) $(Q)+$(MAKE)
+else
+# 3.80 or older
+cmd_CC = $(Q)echo "  CC      $@";$(CC)
+cmd_LD = $(Q)echo "  LD      $@";$(LD)
+cmd_AR = $(Q)echo "  AR      $@";$(AR)
+cmd_MAKE = $(Q)echo "  MAKE    $@";$(MAKE)
+endif
+endif
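+
+# Illustrative usage (editorial sketch, not part of the original file): a
+# compile rule built on cmd_CC prints a terse "CC <target>" line when V=0
+# and the full command line when V=1:
+#   %.o: %.c
+#   	$(cmd_CC) $(CFLAGS) -c -o $@ $<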